pacman::p_load(psych, xray , ggplot2, texreg, DT, wrapr, dplyr,
sjmisc, sjlabelled, sjstats, sjPlot, tidyr,ggpubr,corrplot,
knitr, kableExtra, captioner, car, excelR,ggcorrplot,Rmisc,sjmisc,funModeling,
forcats, hrbrthemes,tidyverse,finalfit,patchwork,GGally,broom,caret,matrixTests,PerformanceAnalytics, pROC, rattle, finalfit)
df = read.csv("Well_being_Health_Behavior_Environmental_Factors_Dataset.csv")
pacman::p_load("DT")
head(x = df,n=5) %>% datatable(rownames=TRUE,filter = "top", options=list(pageLength = 10, scrollX=T))
pacman::p_load("DT")
tail(x = df,n=5) %>% datatable(rownames=TRUE,filter = "top", options=list(pageLength = 10, scrollX=T))
pacman::p_load("DT","psych")
headTail(x = df,top = 4,bottom = 4) %>% datatable(rownames=TRUE,filter = "top", options=list(pageLength = 10, scrollX=T,scrollY=T))
cat("The Dataset (df) contains ",nrow(df)," records and", ncol(df)," Columns or variables.")
## The Dataset (df) contains 395697 records and 33 Columns or variables.
str(df)
## 'data.frame': 395697 obs. of 33 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : int 30 64 49 43 68 38 23 61 78 82 ...
## $ wellbeing : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SocialSupport : int 5 4 3 5 4 4 5 3 5 3 ...
## $ race : int 0 1 1 1 1 1 1 1 1 0 ...
## $ income : int 5 5 1 5 2 5 5 NA NA NA ...
## $ GeneralHealth : int 4 4 5 2 5 4 1 3 3 3 ...
## $ PoorPhysicalHealthDays: int 0 5 NA 0 30 0 0 0 4 0 ...
## $ PoorMentalHealthDays : int 0 0 0 0 3 0 0 0 0 0 ...
## $ Asthma : int 0 1 1 0 1 0 0 0 0 0 ...
## $ HealthInsurance : int 1 1 1 1 0 1 1 1 1 1 ...
## $ CVD : int 0 0 0 0 0 0 0 0 0 0 ...
## $ LimitedActivity : int 0 1 1 0 1 1 0 0 0 0 ...
## $ Diabetes : int 0 1 0 0 0 0 0 0 0 0 ...
## $ employ : int 1 7 8 1 8 1 1 7 7 5 ...
## $ BMI : int 3 3 3 2 1 2 2 3 2 1 ...
## $ HeavyDrinker : int 0 0 0 1 0 0 0 0 0 0 ...
## $ CurrentSmoker : int 0 0 1 0 0 0 0 0 0 0 ...
## $ PoorSleepDays : int 1 30 0 0 14 30 0 2 0 2 ...
## $ PhysicalActivity : int 1 0 0 1 0 0 1 1 1 0 ...
## $ marital : int 1 1 0 1 1 1 1 1 1 0 ...
## $ PrematureMortality : num 440 440 440 440 440 ...
## $ HouseholdIncome : int 48863 48863 48863 48863 48863 48863 48863 48863 48863 48863 ...
## $ ParkAccess : int 20 20 20 20 20 20 20 20 20 20 ...
## $ CrimeRate : int 300 300 300 300 300 300 300 300 300 300 ...
## $ UnemploymentRate : num 8 8 8 8 8 8 8 8 8 8 ...
## $ WaterSafety : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HighSchoolRate : int 80 80 80 80 80 80 80 80 80 80 ...
## $ SomeCollegeRate : num 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 ...
## $ AccessToRecFacility : num 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 ...
## $ FastFoodPercentage : int 52 52 52 52 52 52 52 52 52 52 ...
## $ PM2.5 : num 13.1 13.1 13.1 13.1 13.1 ...
## $ PopulationDensity : num 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 ...
The Dataset originally has only numerical and integer
values
Our Target variable “wellbeing” is also an integer datatype.
However, we expect that into a category as we conceive it as
classification thing of Wellbeing in the form of category as (yes or no)
or (good or bad).
2.Action
: Convert “Wellbeing” into a factor
It is also evident by looking at the structure of the dataset
that several other variables like (Asthma, HealthInsurance, CVD,
Diabetes, HeavyDrinker) to name a few are supposed to be categories or
factors but infact they are integers.
3.Action : Convert those set of
Categorical Variables as Factors from Integers.
Some set of variables are of the perfect type for example - Age is an integer and is supposed to be one. PM2.5 which represents the Fine Particle Matter in micrograms/cubic meter is actually a numerical data in a continous form.
Some Variables such as BMI and income were expected to be continuous. However, they are ordinal values mentioned in integers which is a point of concern
5. Action : Convert those variables into probable factors whenver needed in Analytics
summary(df)
## X age wellbeing SocialSupport
## Min. : 1 Min. : 7.00 Min. :0.000 Min. :1.000
## 1st Qu.: 98925 1st Qu.:45.00 1st Qu.:0.000 1st Qu.:4.000
## Median :197849 Median :57.00 Median :0.000 Median :5.000
## Mean :197849 Mean :56.32 Mean :0.055 Mean :4.181
## 3rd Qu.:296773 3rd Qu.:69.00 3rd Qu.:0.000 3rd Qu.:5.000
## Max. :395697 Max. :99.00 Max. :1.000 Max. :5.000
## NA's :17025 NA's :20998
## race income GeneralHealth PoorPhysicalHealthDays
## Min. :0.000 Min. :1.00 Min. :1.000 Min. : 0.000
## 1st Qu.:1.000 1st Qu.:2.00 1st Qu.:2.000 1st Qu.: 0.000
## Median :1.000 Median :4.00 Median :3.000 Median : 0.000
## Mean :0.806 Mean :3.61 Mean :2.586 Mean : 4.472
## 3rd Qu.:1.000 3rd Qu.:5.00 3rd Qu.:3.000 3rd Qu.: 3.000
## Max. :1.000 Max. :5.00 Max. :5.000 Max. :30.000
## NA's :5309 NA's :54657 NA's :1555 NA's :9341
## PoorMentalHealthDays Asthma HealthInsurance CVD
## Min. : 0.000 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.000
## Median : 0.000 Median :0.0000 Median :1.0000 Median :0.000
## Mean : 3.497 Mean :0.0936 Mean :0.8955 Mean :0.067
## 3rd Qu.: 2.000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :30.000 Max. :1.0000 Max. :1.0000 Max. :1.000
## NA's :7361 NA's :2590 NA's :991 NA's :3928
## LimitedActivity Diabetes employ BMI
## Min. :0.0000 Min. :0.0000 Min. :1.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:1.000
## Median :0.0000 Median :0.0000 Median :3.000 Median :2.000
## Mean :0.2723 Mean :0.1274 Mean :3.885 Mean :1.933
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:7.000 3rd Qu.:3.000
## Max. :1.0000 Max. :1.0000 Max. :8.000 Max. :3.000
## NA's :1858 NA's :395 NA's :1322 NA's :17290
## HeavyDrinker CurrentSmoker PoorSleepDays PhysicalActivity
## Min. :0.000 Min. :0.0000 Min. : 0.000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.: 0.000 1st Qu.:0.0000
## Median :0.000 Median :0.0000 Median : 3.000 Median :1.0000
## Mean :0.046 Mean :0.1583 Mean : 7.707 Mean :0.7303
## 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:12.000 3rd Qu.:1.0000
## Max. :1.000 Max. :1.0000 Max. :30.000 Max. :1.0000
## NA's :10742 NA's :2463 NA's :7560 NA's :533
## marital PrematureMortality HouseholdIncome ParkAccess
## Min. :0.0000 Min. :133.5 Min. : 22289 Min. : 1.00
## 1st Qu.:0.0000 1st Qu.:294.0 1st Qu.: 41103 1st Qu.: 13.00
## Median :1.0000 Median :345.0 Median : 47930 Median : 32.00
## Mean :0.5552 Mean :355.8 Mean : 50177 Mean : 34.58
## 3rd Qu.:1.0000 3rd Qu.:405.7 3rd Qu.: 56166 3rd Qu.: 52.00
## Max. :1.0000 Max. :883.5 Max. :119525 Max. :100.00
## NA's :1559 NA's :9633
## CrimeRate UnemploymentRate WaterSafety HighSchoolRate
## Min. : 12.0 Min. : 1.100 Min. : 0.000 Min. : 27.0
## 1st Qu.: 202.0 1st Qu.: 7.000 1st Qu.: 0.000 1st Qu.: 74.0
## Median : 343.0 Median : 8.400 Median : 1.000 Median : 80.0
## Mean : 405.6 Mean : 8.716 Mean : 6.511 Mean : 79.1
## 3rd Qu.: 558.0 3rd Qu.:10.200 3rd Qu.: 5.000 3rd Qu.: 86.0
## Max. :2062.0 Max. :29.700 Max. :100.000 Max. :100.0
## NA's :4174 NA's :5039 NA's :16
## SomeCollegeRate AccessToRecFacility FastFoodPercentage PM2.5
## Min. :23.20 Min. : 0.00 Min. : 8.00 Min. : 6.50
## 1st Qu.:54.00 1st Qu.: 6.90 1st Qu.: 44.00 1st Qu.:10.86
## Median :61.20 Median : 9.80 Median : 50.00 Median :11.74
## Mean :60.45 Mean :10.13 Mean : 48.33 Mean :11.63
## 3rd Qu.:67.70 3rd Qu.:13.00 3rd Qu.: 54.00 3rd Qu.:12.78
## Max. :90.40 Max. :57.50 Max. :100.00 Max. :14.78
## NA's :42
## PopulationDensity
## Min. : 1.7
## 1st Qu.: 66.3
## Median : 276.9
## Mean : 1186.5
## 3rd Qu.: 991.3
## Max. :69468.4
##
The Summary provides a good support of evidence for those points
of concerns expressed earlier in the structure of the dataset. It
describes the standard statistics for those variables which are supposed
to be categorical variables either ordinal or nominal
The Summary indicates
something interesting that many records and most variables in the
dataset contains null values.
Action : Either Remove
or Impute Null Values ensuring that in each of the case does not
statistically significantly alters the dataset.
The Good part that the summary iterates is that the almost all variables are within their supposed range. For an instance, no age record is negative, No PM2.5 less than 0, Household income min is not negative and so on. This is a sigh of relief.
Below are some boxplots for concerning continuous and discrete variables demonstrating their ranges with outliers
This boxplot for age seems very healthy. In the original BRFSS dataset, ages 7 and 9 were designated for mising values.
boxplot(df["PrematureMortality"],main = "Premature Mortality Box Plot", xlab = "Premature Mortality",horizontal = TRUE)
The distribution of the Premature Mortality have some outliers in the right ends shows that it is skewed to the right.
pacman::p_load("ggplot2")
ggplot(data=df,aes(x=PrematureMortality)) + geom_density()+
ggtitle ("Premature Mortality Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) )
The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.
From the above test, we could possibly reject the null hypothesis that the distribution of premature Mortality is normal. However through Central Limit Theorem I shall prove the normality.
s30 <- c()
s50 <- c()
s500 <- c()
n =396000
for ( i in 1:n){
s30[i] = mean(sample(df$PrematureMortality,30, replace = TRUE))
s50[i] = mean(sample(df$PrematureMortality,50, replace = TRUE))
s500[i] = mean(sample(df$PrematureMortality,500, replace = TRUE))
}
par(mfrow=c(1,3))
hist(s30, col ="lightblue",main="Sample size=30",xlab ="Premature Mortality")
abline(v = mean(s30), col = "red")
hist(s50, col ="lightgreen", main="Sample size=50",xlab ="Premature Mortality")
abline(v = mean(s50), col = "red")
hist(s500, col ="orange",main="Sample size=500",xlab ="Premature Mortality")
abline(v = mean(s500), col = "red")
We could clearly see a bell curve with Central Limit Theorem and prove that with sufficiently large sample size any distribution that does not seem to be normally distributed gets normally distributed using Central Limit Theorem or Bootstrapping.
boxplot(df["HouseholdIncome"],main = "Household Income Box Plot", xlab = "Household Income",horizontal = TRUE)
The distribution of the Household Income have some outliers in the right
ends shows that it is skewed to the right.
pacman::p_load("ggplot2")
ggplot(data=df,aes(x=HouseholdIncome)) + geom_density()+
ggtitle ("Household Income Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) )
The distribution does not seems normal but I suspect that when we apply
the CLT or Bootstrapping Normality would be proven.
So from now on we shall not Check with the CLT or Bootstrapping for the normal distribution of any variable rather assume that the distribution will be normal whenever viewed from the lens of CLT or Bootstrapping.
boxplot(df["UnemploymentRate"],main = "Unemployment Rate Box Plot", xlab = "Unemployment Rate ",horizontal = TRUE)
The distribution of the Unemployment Rate have some outliers on both
ends more probably on the right which signifies that it is skewed more
to the right.
pacman::p_load("ggplot2")
ggplot(data=df,aes(x=UnemploymentRate)) + geom_density()+
ggtitle ("Unemployment Rate Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) )
The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.
boxplot(df["HighSchoolRate"],main = "High School Rate Box Plot", xlab = "High School Rate",horizontal = TRUE)
The distribution of the High School Rate have some outliers on left ends
which signifies that it is skewed more to the left.
pacman::p_load("ggplot2")
ggplot(data=df,aes(x=HighSchoolRate)) + geom_density()+
ggtitle ("High School Rate Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) )
## Warning: Removed 16 rows containing non-finite values (`stat_density()`).
The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.
boxplot(df["SomeCollegeRate"],main = "Some College Rate Box Plot", xlab = "Some College Rate ",horizontal = TRUE)
The distribution of the Some College Rate have some outliers on left
ends which signifies that it is skewed more to the left.
pacman::p_load("ggplot2")
ggplot(data=df,aes(x=SomeCollegeRate)) + geom_density()+
ggtitle ("Some College Rate Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) )
The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.
boxplot(df["AccessToRecFacility"],main = "Access To Recreational Facility Box Plot", xlab = "Access To Recreational Facility ",horizontal = TRUE)
The distribution of the Access To Recreational Facility have some
outliers on right ends which signifies that it is skewed more to the
left.
pacman::p_load("ggplot2")
ggplot(data=df,aes(x=AccessToRecFacility)) + geom_density()+
ggtitle ("Access To Recreational Facility Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) )
The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.
boxplot(df["FastFoodPercentage"],main = "Fast Food Percentage Box Plot", xlab = "Fast Food Percentage ",horizontal = TRUE)
The distribution of the Fast Food Percentage have some outliers on both
ends
pacman::p_load("ggplot2")
ggplot(data=df,aes(x=FastFoodPercentage)) + geom_density()+
ggtitle ("Fast Food Percentage Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) )
## Warning: Removed 42 rows containing non-finite values (`stat_density()`).
The distribution seems normal with multiple peaks and higher kurtosis
boxplot(df["PM2.5"],main = "PM2.5 Box Plot", xlab = "PM2.5 ",horizontal = TRUE)
The distribution of the PM2.5 have some outliers on left ends which
signifies that it is skewed more to the left.
pacman::p_load("ggplot2")
ggplot(data=df,aes(x=PM2.5)) + geom_density()+
ggtitle ("PM2.5 Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) )
The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.
boxplot(df["PopulationDensity"],main = "Population Density Box Plot", xlab = "Population Density ",horizontal = TRUE)
The distribution of the Population Density have some outliers on right
ends which signifies that it is skewed more to the right
pacman::p_load("ggplot2")
ggplot(data=df,aes(x=PopulationDensity)) + geom_density()+
ggtitle ("Population Density Density Plot") +
theme (axis.title.x=element_text (size=16, face="bold", colour="blue")) +
theme (axis.text.x=element_text (size=14 ) )
The distribution does not seems normal but I suspect that when we apply the CLT or Bootstrapping Normality would be proven.
As far as continuous variables are concerned their
distribution is quite questionable however, with Central Limit Theorem
it could be deduced that they are representative of their population.
Also the boxplot help us indicates that the variables are in their
expected range and have acceptable set of outliers and shall help us
validate the min, max, median and range of those variables.
# remove missing age values
df <- df %>%
mutate(age = na_if(age, 7)) %>%
mutate(age = na_if(age, 9))
apply(apply(df,2,is.na),2,sum)
## X age wellbeing
## 0 3364 17025
## SocialSupport race income
## 20998 5309 54657
## GeneralHealth PoorPhysicalHealthDays PoorMentalHealthDays
## 1555 9341 7361
## Asthma HealthInsurance CVD
## 2590 991 3928
## LimitedActivity Diabetes employ
## 1858 395 1322
## BMI HeavyDrinker CurrentSmoker
## 17290 10742 2463
## PoorSleepDays PhysicalActivity marital
## 7560 533 1559
## PrematureMortality HouseholdIncome ParkAccess
## 0 0 9633
## CrimeRate UnemploymentRate WaterSafety
## 4174 0 5039
## HighSchoolRate SomeCollegeRate AccessToRecFacility
## 16 0 0
## FastFoodPercentage PM2.5 PopulationDensity
## 42 0 0
library(tidyr)
pacman::p_load(inspectdf)
df %>% inspect_na %>% show_plot
df[,c(2:21,24,25,27,28,31)] %>% inspect_na %>% show_plot +
ggtitle("Prevalence of Missing Values in the Dataset") +
ylab("% of variable that is NA") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust=1, hjust=1))
sum(apply(df,1,anyNA))
## [1] 120094
cat("There are around ",sum(apply(df,1,anyNA)), " null records in the dataset of in total", nrow(df),"records \n","In case we delete all of the", sum(apply(df,1,anyNA)),"records with missing values then we are actually losing around", 100*(sum(apply(df,1,anyNA))/nrow(df)),"% of the total dataset.")
## There are around 120094 null records in the dataset of in total 395697 records
## In case we delete all of the 120094 records with missing values then we are actually losing around 30.34999 % of the total dataset.
df.omit <- na.omit(df)
pacman::p_load(psych,DT)
headTail(df.omit, 10, 10) %>% datatable( rownames = FALSE, filter="top", options = list(pageLength = 21, scrollX=T))
cat(nrow(df.omit),ncol(df.omit))
## 275603 33
#write.csv(df.omit,file="df.omit.csv")
# Welch T-Test
# numeric variables names
nvar_clean <- colnames(df.omit)[c(2,22:33)]
# test difference in means
mean_tt_omit <- function(a) {
col_t_welch(df[a], df.omit[a])
}
# create means summary table
ar_omit <- array(0, dim=c(13,17)) # create empty array 13 rows, 18 columns
ar_omit <- as.data.frame(ar_omit) # convert to dataframe
for (i in c(2,22:33)){
ar_omit[i,1:17] <- mean_tt_omit(i) # add results of welch t-test
rownames(ar_omit)[i] <- colnames(df.omit)[i] # add variable name
}
# change column names
colnames(ar_omit)[c(1:2, 4:6, 12, 13:14)] <- c("original n", "omit NAs n", "original mean", "omit NAs mean", "mean_diff", "p-value", "lower-CI", "upper-CI")
# print table
ar_omit[nvar_clean, c(1:2, 4:6, 12, 13:14)] %>%
datatable(caption="Summary table of difference in means between original and cleaned dataset", rownames=TRUE, options=list(pageLength=13,scrollX=T)) %>%
formatRound(c(3:5, 7:8), digits=2) %>%
formatRound(6, digits=5)
fvar_clean <- c(3,5,10:14,17:18,20:21)
chi_test <- function(a){
chisq.test(table(df[,a]), p=table(df.omit[,a])/nrow(df.omit))
}
# create summary table
ar <- array(0, dim=c(11,9)) # create empty array 14 rows, 9 columns
ar <- as.data.frame(ar) # convert to dataframe
for (i in fvar_clean){
ar[i,1:9] <- chi_test(i) # add results of chi-square test
rownames(ar)[i] <- colnames(df)[i] # add variable name
}
colnames(ar)[c(1,3)] <- c("Chi-square", "p-value") # change column names
# print table
ar[fvar_clean, c(1,3)] %>%
datatable(caption="Summary table of Chi-squared test on factor variables", rownames=TRUE, options=list(pageLength=14,scrollX=T)) %>%
formatRound(columns=c(1:2), digits=6)
Due to large Sample size everything starts to make sense which can be seen by the p-value. Even after it shows statistically significant difference in the dataset before and after deleting null values. We tend to ignore the statistical evidence and stick to the fact that we have quite enough sample size to represent the population as a whole.
str(df.omit)
## 'data.frame': 275603 obs. of 33 variables:
## $ X : int 1 2 4 5 6 7 12 13 14 15 ...
## $ age : int 30 64 43 68 38 23 44 64 75 70 ...
## $ wellbeing : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SocialSupport : int 5 4 5 4 4 5 4 3 1 5 ...
## $ race : int 0 1 1 1 1 1 1 0 1 1 ...
## $ income : int 5 5 5 2 5 5 5 3 3 4 ...
## $ GeneralHealth : int 4 4 2 5 4 1 5 4 1 2 ...
## $ PoorPhysicalHealthDays: int 0 5 0 30 0 0 21 3 0 0 ...
## $ PoorMentalHealthDays : int 0 0 0 3 0 0 14 2 0 0 ...
## $ Asthma : int 0 1 0 1 0 0 1 0 0 0 ...
## $ HealthInsurance : int 1 1 1 0 1 1 1 1 1 1 ...
## $ CVD : int 0 0 0 0 0 0 0 0 0 0 ...
## $ LimitedActivity : int 0 1 0 1 1 0 1 0 0 0 ...
## $ Diabetes : int 0 1 0 0 0 0 0 0 0 0 ...
## $ employ : int 1 7 1 8 1 1 7 7 7 7 ...
## $ BMI : int 3 3 2 1 2 2 2 2 2 2 ...
## $ HeavyDrinker : int 0 0 1 0 0 0 0 0 0 0 ...
## $ CurrentSmoker : int 0 0 0 0 0 0 0 0 1 0 ...
## $ PoorSleepDays : int 1 30 0 14 30 0 15 10 0 15 ...
## $ PhysicalActivity : int 1 0 1 0 0 1 0 1 1 1 ...
## $ marital : int 1 1 1 1 1 1 0 1 0 0 ...
## $ PrematureMortality : num 440 440 440 440 440 ...
## $ HouseholdIncome : int 48863 48863 48863 48863 48863 48863 48863 48863 48863 48863 ...
## $ ParkAccess : int 20 20 20 20 20 20 20 20 20 20 ...
## $ CrimeRate : int 300 300 300 300 300 300 300 300 300 300 ...
## $ UnemploymentRate : num 8 8 8 8 8 8 8 8 8 8 ...
## $ WaterSafety : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HighSchoolRate : int 80 80 80 80 80 80 80 80 80 80 ...
## $ SomeCollegeRate : num 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 ...
## $ AccessToRecFacility : num 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 ...
## $ FastFoodPercentage : int 52 52 52 52 52 52 52 52 52 52 ...
## $ PM2.5 : num 13.1 13.1 13.1 13.1 13.1 ...
## $ PopulationDensity : num 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 ...
## - attr(*, "na.action")= 'omit' Named int [1:120094] 3 8 9 10 11 22 26 27 29 32 ...
## ..- attr(*, "names")= chr [1:120094] "3" "8" "9" "10" ...
Factorizing
variables_recoded = c("wellbeing","SocialSupport","race","income","GeneralHealth","Asthma","HealthInsurance","CVD","LimitedActivity","Diabetes","employ","BMI","HeavyDrinker","CurrentSmoker","PhysicalActivity","marital")
for (i in variables_recoded){
df.omit[,i] <- as.factor(df.omit[,i])
}
Checking the Status
str(df.omit)
## 'data.frame': 275603 obs. of 33 variables:
## $ X : int 1 2 4 5 6 7 12 13 14 15 ...
## $ age : int 30 64 43 68 38 23 44 64 75 70 ...
## $ wellbeing : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SocialSupport : Factor w/ 5 levels "1","2","3","4",..: 5 4 5 4 4 5 4 3 1 5 ...
## $ race : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 2 1 2 2 ...
## $ income : Factor w/ 5 levels "1","2","3","4",..: 5 5 5 2 5 5 5 3 3 4 ...
## $ GeneralHealth : Factor w/ 5 levels "1","2","3","4",..: 4 4 2 5 4 1 5 4 1 2 ...
## $ PoorPhysicalHealthDays: int 0 5 0 30 0 0 21 3 0 0 ...
## $ PoorMentalHealthDays : int 0 0 0 3 0 0 14 2 0 0 ...
## $ Asthma : Factor w/ 2 levels "0","1": 1 2 1 2 1 1 2 1 1 1 ...
## $ HealthInsurance : Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 2 2 ...
## $ CVD : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ LimitedActivity : Factor w/ 2 levels "0","1": 1 2 1 2 2 1 2 1 1 1 ...
## $ Diabetes : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
## $ employ : Factor w/ 8 levels "1","2","3","4",..: 1 7 1 8 1 1 7 7 7 7 ...
## $ BMI : Factor w/ 3 levels "1","2","3": 3 3 2 1 2 2 2 2 2 2 ...
## $ HeavyDrinker : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 1 ...
## $ CurrentSmoker : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 1 ...
## $ PoorSleepDays : int 1 30 0 14 30 0 15 10 0 15 ...
## $ PhysicalActivity : Factor w/ 2 levels "0","1": 2 1 2 1 1 2 1 2 2 2 ...
## $ marital : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 1 2 1 1 ...
## $ PrematureMortality : num 440 440 440 440 440 ...
## $ HouseholdIncome : int 48863 48863 48863 48863 48863 48863 48863 48863 48863 48863 ...
## $ ParkAccess : int 20 20 20 20 20 20 20 20 20 20 ...
## $ CrimeRate : int 300 300 300 300 300 300 300 300 300 300 ...
## $ UnemploymentRate : num 8 8 8 8 8 8 8 8 8 8 ...
## $ WaterSafety : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HighSchoolRate : int 80 80 80 80 80 80 80 80 80 80 ...
## $ SomeCollegeRate : num 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 54.4 ...
## $ AccessToRecFacility : num 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 7.3 ...
## $ FastFoodPercentage : int 52 52 52 52 52 52 52 52 52 52 ...
## $ PM2.5 : num 13.1 13.1 13.1 13.1 13.1 ...
## $ PopulationDensity : num 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 91.8 ...
## - attr(*, "na.action")= 'omit' Named int [1:120094] 3 8 9 10 11 22 26 27 29 32 ...
## ..- attr(*, "names")= chr [1:120094] "3" "8" "9" "10" ...
df.omit <- within(df.omit, {
wellbeing <- Recode(wellbeing, '"1" = "Poor_Wellbeing"; "0" = "Good_Wellbeing"', as.factor=TRUE, to.value="=", interval=":",
separator=";")
})
df.omit <- within(df.omit, {
CurrentSmoker <- Recode(CurrentSmoker, '"1" = "Current Smoker"; "0" = "Not a Current Smoker"; ;', as.factor=TRUE, to.value="=",
interval=":", separator=";")
})
df.omit <- within(df.omit, {
HeavyDrinker <- Recode(HeavyDrinker, '"1" = "Heavy Drinker"; "0" = "Not a Heavy Drinker"; ; ;', as.factor=TRUE, to.value="=",
interval=":", separator=";")
})
df.omit <- within(df.omit, {
HealthInsurance <- Recode(HealthInsurance , '"1" = "With Health Coverage"; "0" = "No Health Coverage"', as.factor=TRUE, to.value="=", interval=":",
separator=";")
})
df.omit <- within(df.omit, {
CVD <- Recode(CVD, '"1" = "Cardiovascular Disease"; "0" = "No Cardiovascular Disease"; ;', as.factor=TRUE, to.value="=",
interval=":", separator=";")
})
df.omit <- within(df.omit, {
Diabetes <- Recode(Diabetes, '"1" = "Diabetic"; "0" = "Non Diabetic"; ; ;', as.factor=TRUE, to.value="=",
interval=":", separator=";")
})
df.omit <- within(df.omit, {
LimitedActivity <- Recode(LimitedActivity , '"1" = "Limited"; "0" = "Not Limited"; ; ;', as.factor=TRUE, to.value="=",
interval=":", separator=";")
})
df.omit <- within(df.omit, {
PhysicalActivity <- Recode(PhysicalActivity , '"1" = "Physically Active"; "0" = "Not Physically Active"; ; ;', as.factor=TRUE, to.value="=",
interval=":", separator=";")
})
df.omit <- within(df.omit, {
employ <- Recode(employ , '"1" = "Waged Employement"; "2" = "Self-Employed";"3" = "Unemployed for more than 1 yr" ;"4" = "Unemployed for less than 1 yr";"5" = "Homemaker";"6" = "Student";"7" = "Retired";"8" = "Unable to work" ;', as.factor=TRUE, to.value="=",
interval=":", separator=";")
})
df.omit <- within(df.omit, {
BMI <- Recode(BMI , '"1" = "Healthy weight"; "2" = "Overweight";"3" = "Obese" ;;;', as.factor=TRUE, to.value="=",
interval=":", separator=";")
})
df.omit$income <- recode_factor(df.omit$income,
"1" = "less than 15000",
"2" = "[15000-25000)",
"3" = "[25000-35000)",
"4" = "[35000-50000)",
"5" = "50000 or more")
df.omit$Asthma <- recode_factor(df.omit$Asthma, '0' = "No Asthma", '1' = "Asthma")
df.omit$SocialSupport <- recode_factor(df.omit$SocialSupport, '1' = "Never", '2' = " Rarely", '3' = " Sometimes", '4' = "Usually", '5' = "Always")
df.omit$race <- recode_factor(df.omit$race, '0' = "Other races", '1' = "White")
df.omit$GeneralHealth <- recode_factor(df.omit$GeneralHealth, '1' = "Excellent",
'2' = "Very good",
'3' = "Good",
'4' = "Fair",
'5' = "Poor")
df.omit$marital <- recode_factor(df.omit$marital, '0' = "Not married", '1' = "Married")
# write.csv(df.omit,"Final_Furbished.csv",row.names = FALSE)
df_final = read.csv("Final_Furbished.csv")
variables_recoded = c("wellbeing","SocialSupport","race","income","GeneralHealth","Asthma","HealthInsurance","CVD","LimitedActivity","Diabetes","employ","BMI","HeavyDrinker","CurrentSmoker","PhysicalActivity","marital")
for (i in variables_recoded){
df_final[,i] <- as.factor(df_final[,i])
}
summary(df_final)
## X age wellbeing SocialSupport
## Min. : 1 Min. :18.00 Good_Wellbeing:260883 Rarely : 8630
## 1st Qu.: 98517 1st Qu.:44.00 Poor_Wellbeing: 14720 Sometimes: 29706
## Median :196815 Median :56.00 Always :142109
## Mean :197112 Mean :55.55 Never : 12615
## 3rd Qu.:295444 3rd Qu.:67.00 Usually : 82543
## Max. :395697 Max. :99.00
##
## race income GeneralHealth
## Other races: 49755 [15000-25000) : 45957 Excellent:52300
## White :225848 [25000-35000) : 32151 Fair :33709
## [35000-50000) : 41985 Good :81136
## 50000 or more :127154 Poor :14478
## less than 15000: 28356 Very good:93980
##
##
## PoorPhysicalHealthDays PoorMentalHealthDays Asthma
## Min. : 0.000 Min. : 0.000 Asthma : 25592
## 1st Qu.: 0.000 1st Qu.: 0.000 No Asthma:250011
## Median : 0.000 Median : 0.000
## Mean : 4.202 Mean : 3.437
## 3rd Qu.: 3.000 3rd Qu.: 2.000
## Max. :30.000 Max. :30.000
##
## HealthInsurance CVD
## No Health Coverage : 27354 Cardiovascular Disease : 17940
## With Health Coverage:248249 No Cardiovascular Disease:257663
##
##
##
##
##
## LimitedActivity Diabetes
## Limited : 72270 Diabetic : 32752
## Not Limited:203333 Non Diabetic:242851
##
##
##
##
##
## employ BMI
## Waged Employement :121285 Healthy weight: 94336
## Retired : 74523 Obese : 79608
## Self-Employed : 23118 Overweight :101659
## Homemaker : 18449
## Unable to work : 17507
## Unemployed for more than 1 yr: 8814
## (Other) : 11907
## HeavyDrinker CurrentSmoker PoorSleepDays
## Heavy Drinker : 14131 Current Smoker : 44088 Min. : 0.000
## Not a Heavy Drinker:261472 Not a Current Smoker:231515 1st Qu.: 0.000
## Median : 3.000
## Mean : 7.772
## 3rd Qu.:12.000
## Max. :30.000
##
## PhysicalActivity marital PrematureMortality
## Not Physically Active: 68341 Married :159034 Min. :133.5
## Physically Active :207262 Not married:116569 1st Qu.:290.6
## Median :341.3
## Mean :350.7
## 3rd Qu.:400.2
## Max. :883.5
##
## HouseholdIncome ParkAccess CrimeRate UnemploymentRate
## Min. : 23751 Min. : 1.00 Min. : 12.0 Min. : 1.100
## 1st Qu.: 41796 1st Qu.: 13.00 1st Qu.: 200.0 1st Qu.: 6.900
## Median : 48622 Median : 33.00 Median : 343.0 Median : 8.400
## Mean : 50813 Mean : 34.76 Mean : 397.7 Mean : 8.625
## 3rd Qu.: 56750 3rd Qu.: 53.00 3rd Qu.: 544.0 3rd Qu.:10.200
## Max. :119525 Max. :100.00 Max. :1627.0 Max. :29.700
##
## WaterSafety HighSchoolRate SomeCollegeRate AccessToRecFacility
## Min. : 0.000 Min. : 27.00 Min. :24.20 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 74.00 1st Qu.:54.70 1st Qu.: 7.20
## Median : 1.000 Median : 81.00 Median :61.90 Median : 9.90
## Mean : 6.427 Mean : 79.36 Mean :61.13 Mean :10.34
## 3rd Qu.: 5.000 3rd Qu.: 86.00 3rd Qu.:68.30 3rd Qu.:13.00
## Max. :100.000 Max. :100.00 Max. :90.40 Max. :51.60
##
## FastFoodPercentage PM2.5 PopulationDensity
## Min. : 8.00 Min. : 6.50 Min. : 1.7
## 1st Qu.: 44.00 1st Qu.:10.83 1st Qu.: 69.8
## Median : 50.00 Median :11.69 Median : 290.2
## Mean : 48.22 Mean :11.59 Mean : 1033.7
## 3rd Qu.: 54.00 3rd Qu.:12.78 3rd Qu.: 990.8
## Max. :100.00 Max. :14.78 Max. :69468.4
##
psych::describe(df_final, skew=FALSE, ranges=TRUE, quant=NULL, IQR=FALSE) %>%
datatable(rownames=TRUE,
options=list(pageLength=10,scrollX=T)) %>%
formatRound(c(3:4,8), 2)
my_colors <- c("#25ADB4","#777777")
my_colors2 <- c("#67d1d3","#777777")
mycolors2 <- c("#434343","#44c5c8","#67d1d3","#8ddcde","#baeaea")
my_color_palette <- c("#FF5656", "#1A78E0")
df_final %>%
ggplot(aes(x = wellbeing, fill = wellbeing)) + geom_bar(aes(y = after_stat(count)/sum(after_stat(count)))) + labs(title = "Wellbeing Distribution") + xlab("Wellbeing") + ylab("Percentage")
# add labels
df_final %>%
ggplot(aes(x = wellbeing, fill = wellbeing)) +
geom_bar() +
stat_count(aes(label = paste( round(prop.table(..count..), digits=3) * 100, "%", sep = "")),
vjust = -1, geom = "text", position = "identity", color ="black", size=5) +
labs(title = "Wellbeing Distribution") +
ylim(0,300000) +
xlab("Wellbeing") +
ylab("Number of Individuals") +
theme_bw() +
theme(legend.position = "none") +
scale_fill_manual(values=my_colors2)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
### Age
df_final$Age_group <- ifelse(df_final$age < 25, "Age 18 to 24",ifelse(df_final$age < 35, "Age 25 to 34",ifelse(df_final$age < 45, "Age 35 to 44", ifelse(df_final$age < 55, "Age 44 to 54",ifelse(df_final$age < 65, "Age 55 to 64", "Age 65 or older")))))
df_final %>%
ggplot(aes(x = reorder(Age_group, Age_group, length, decreasing = FALSE), fill = Age_group, angle = 45)) +
geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "Age Group Distribution") + xlab("Age group") + ylab("Percentage") + theme(legend.position = "none",axis.text.x = element_text(angle = 45))
# add labels
df_final %>%
ggplot(aes(x = reorder(Age_group, Age_group, length, decreasing = FALSE), fill = Age_group)) +
geom_bar() +
stat_count(aes(label = paste( round(prop.table(..count..), digits=3) * 100, "%", sep = "")),
vjust = -1, geom = "text", position = "identity", color ="black", size=5) +
ylim(0,90000) +
labs(title = "Age Group Distribution") +
xlab("Age group") +
ylab("Number of Individuals") +
theme_bw() +
theme(legend.position = "none")
### Social Support
df_final %>%
ggplot(aes(x = SocialSupport, fill = SocialSupport)) + geom_bar() + geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "Social Support Distribution") + xlab("Social Support") + ylab("Percentage")
### Income
df_final %>%
ggplot(aes(x = income, fill = income)) + geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "Income Distribution") + xlab("Income") + ylab("Percentage") + theme(legend.position = "none",axis.text.x = element_text(angle = 45))
### Employment Status
# employment status
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## The following objects are masked from 'package:psych':
##
## alpha, rescale
df_final %>%
ggplot(aes(x = reorder(employ, employ, length, decreasing = FALSE), fill = employ)) +
geom_bar() +
stat_count(aes(label = paste( round(prop.table(..count..), digits=3) * 100, "%", sep = "")),
vjust = -1, geom = "text", position = "identity", color ="black", size=5) +
ylim(0,150000) +
labs(title = "Employment Status Distribution") +
xlab("Employment Status") +
ylab("Number of Individuals") +
theme_bw() +
scale_x_discrete(labels = wrap_format(10)) +
theme(legend.position = "none")
df_final %>%
ggplot(aes(x = GeneralHealth, fill = GeneralHealth)) + geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "General Health Distribution") + xlab("General Health Status") + ylab("Percentage") + theme(legend.position = "none",axis.text.x = element_text(angle = 45))
### Asthma
df_final %>%
ggplot(aes(x = Asthma, fill = Asthma)) + geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "Asthma Distribution") + xlab("Asthma") + ylab("Percentage")
### Race
df_final %>%
ggplot(aes(x = race, fill = race)) + geom_bar(aes(y = (..count..)/sum(..count..))) + labs(title = "Race Distribution") + xlab("Race") + ylab("Percentage")
race_bar <- df_final %>%
ggplot(aes(x = race, fill = race)) +
geom_bar() +
stat_count(aes(label = paste( round(prop.table(..count..), digits=3) * 100, "%", sep = "")), vjust = -1, geom = "text", position = "identity", color ="black", size=5) +
ylim(0,260000) +
labs(title = "Race Distribution") +
xlab("Race") +
ylab("Percentage") +
theme_bw() +
theme(legend.position = "none") +
scale_fill_brewer(palette="Dark2")
race_bar
### Marital Status Distribution
# bar chart
df_final %>%
ggplot(aes(x = marital, fill = marital)) +
geom_bar() +
stat_count(aes(label = paste( round(prop.table(..count..), digits=3) * 100, "%", sep = "")), vjust = -1, geom = "text", position = "identity", color ="black", size=5) +
ylim(0,200000) +
labs(title = "Marital Status Distribution") +
xlab("Marital Status") +
ylab("Percentage") +
theme_bw() +
theme(legend.position = "none") +
scale_fill_brewer(palette = "Set1")
# age and race
age_race_plot <- df.omit %>%
ggdensity( x = "age" ,
add = "mean", rug = FALSE,
color = "race", fill = "race", palette = "Dark2", alpha=0.25) +
ggtitle("Density Graph of Age by Race") +
theme_bw() +
theme(legend.position="right")
age_race_plot
p_ra <- df_final %>%
ggplot(aes(x = Age_group, fill = race)) +
geom_bar(position = "fill") +
ylab("proportion") +
labs(title = "Age Group and Race Distribution") +
theme_bw() +
theme(legend.position="right") +
scale_fill_brewer(palette="Dark2")
p_ra
p3 <- df_final %>%
ggplot(aes(x = race, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status on different races")
p3
table(df_final$wellbeing, df_final$race)
##
## Other races White
## Good_Wellbeing 46493 214390
## Poor_Wellbeing 3262 11458
rx.p1 <- df_final %>%
ggplot(aes(x = race, fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
rx.p2 <- df_final %>%
ggplot(aes(x = race, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
library(patchwork)
rx.p1 + rx.p2
There seems to be minimal
visual difference in the percentages of the Poor and Good Wellbeing for
Race Non white and Hispanic and Race White and Hispanic
p2 <- df_final %>%
ggplot(aes(x = income, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + theme(axis.text.x = element_text(angle = 30)) + labs(title = "Wellbeing status on different income status")
p2
#### Bin Median HouseholdIncome (county)
# bin mean household income to match individual income bins
df_final <- df_final %>%
mutate(HouseholdIncome.bin = cut(HouseholdIncome, breaks=c(0, 24999, 34999, 49999, 119525)))
df_final$HouseholdIncome.bin <- recode_factor(df_final$HouseholdIncome.bin, "(0,2.5e+04]" = "less than 25000", "(2.5e+04,3.5e+04]" = "[25000,35000)", "(3.5e+04,5e+04]" = "[35000,50000)","(5e+04,1.2e+05]" = "50000 or more")
frq(df.omit$HouseholdIncome.bin)
## NULL
tab_xtab(df_final$HouseholdIncome.bin, df_final$wellbeing, show.col.prc = TRUE, show.exp = TRUE)
| HouseholdIncome.bin | wellbeing | Total | |
|---|---|---|---|
| Good_Wellbeing | Poor_Wellbeing | ||
| less than 25000 |
203 203 0.1 % |
11 11 0.1 % |
214 214 0.1 % |
| [25000,35000) |
16170 16368 6.2 % |
1122 924 7.6 % |
17292 17292 6.3 % |
| [35000,50000) |
123944 124464 47.5 % |
7543 7023 51.2 % |
131487 131487 47.7 % |
| 50000 or more |
120566 119848 46.2 % |
6044 6762 41.1 % |
126610 126610 45.9 % |
| Total |
260883 260883 100 % |
14720 14720 100 % |
275603 275603 100 % |
χ2=166.368 · df=3 · Cramer’s V=0.025 · p=0.000 |
# compare againt individual income
df_final %>%
group_by(HouseholdIncome.bin) %>%
ggplot(aes(x = income, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion") +
ggtitle("Wellbeing status by income level, grouped by county median household income") +
facet_wrap(~HouseholdIncome.bin, ncol=4) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust=1, hjust=1)) +
scale_fill_manual(values=my_colors2)
# income and wellbeing
df_final %>%
ggplot(aes(x = income, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion") +
ggtitle("Wellbeing status on different income levels, grouped by race") +
facet_wrap(~race) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust=1, hjust=1)) +
theme(legend.position="bottom") +
scale_fill_manual(values=my_colors2)
p4 <- df_final %>%
ggplot(aes(x = Asthma, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status between people having or not Asthma")
p4
p5 <- df_final %>%
ggplot(aes(x = Age_group, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status on different age group") + theme(axis.text.x = element_text(angle = 0)) + theme_bw() + scale_fill_manual(values=my_colors2)
p5
p6 <- df_final %>%
ggplot(aes(x = GeneralHealth, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion") +
labs(title = "Wellbeing status on different general health status") +
theme_bw() +
scale_fill_manual(values=my_colors2) +
theme(
legend.position="bottom",
axis.text.x = element_text(size=12, face="bold"),)
p6
#### Wellbeing, General Health, and Race
tab_xtab(df_final$GeneralHealth, df_final$wellbeing, show.cell.prc=TRUE)
| GeneralHealth | wellbeing | Total | |
|---|---|---|---|
| Good_Wellbeing | Poor_Wellbeing | ||
| Excellent |
51525 18.7 % |
775 0.3 % |
52300 19 % |
| Fair |
29680 10.8 % |
4029 1.5 % |
33709 12.3 % |
| Good |
77287 28 % |
3849 1.4 % |
81136 29.4 % |
| Poor |
10663 3.9 % |
3815 1.4 % |
14478 5.3 % |
| Very good |
91728 33.3 % |
2252 0.8 % |
93980 34.1 % |
| Total |
260883 94.7 % |
14720 5.3 % |
275603 100 % |
χ2=18764.085 · df=4 · Cramer’s V=0.261 · p=0.000 |
p6 + facet_wrap(~race)
p8 <- df_final %>%
ggplot(aes(x = CVD, fill = wellbeing)) + geom_bar(position = "fill") + ylab("proportion") + labs(title = "Wellbeing status on patient with and without CVD")
p8
table(df_final$wellbeing, df_final$CVD)
##
## Cardiovascular Disease No Cardiovascular Disease
## Good_Wellbeing 16318 244565
## Poor_Wellbeing 1622 13098
rx.p1 <- df_final %>%
ggplot(aes(x = CVD, fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
rx.p2 <- df_final %>%
ggplot(aes(x = CVD, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
library(patchwork)
rx.p1 + rx.p2
There seems to be visual
difference in the percentages of the Poor and Good Wellbeing for poeple
with and without CVD
table(df_final$wellbeing, df_final$HealthInsurance)
##
## No Health Coverage With Health Coverage
## Good_Wellbeing 24146 236737
## Poor_Wellbeing 3208 11512
x1 = df_final %>%
ggplot(aes(x = PhysicalActivity , fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
x2 = df_final %>%
ggplot(aes(x = PhysicalActivity , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion") +
xlab("Physical Activity in the Last 30 Days") +
ggtitle("Physical Activity or Exercise in the Last 30 Days") +
scale_x_discrete(labels = wrap_format(10)) +
theme_bw() +
scale_fill_manual(values=my_colors2) +
theme(legend.position = "none",
axis.text.x = element_text( size=14, face="bold"))
x1 + x2
smoke.p2 <- df_final %>%
ggplot(aes(x = CurrentSmoker, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion") +
xlab("Current Smoking Status") +
ggtitle("Current Smoking Status") +
scale_x_discrete(labels = wrap_format(10)) +
theme_bw() +
scale_fill_manual(values=my_colors2) +
theme(legend.position = "none",
axis.text.x = element_text( size=14, face="bold"))
library(patchwork)
smoke.p2
table(df_final$LimitedActivity, df_final$wellbeing)
##
## Good_Wellbeing Poor_Wellbeing
## Limited 63039 9231
## Not Limited 197844 5489
df_final %>%
ggplot(aes(x = LimitedActivity , fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
df_final %>%
ggplot(aes(x = LimitedActivity , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
It could be confirmed visually
that with Low Physical Activitys there is a significant increase in
proportion of the Poor Wellbeing of individuals
limact_p <- df_final %>%
ggplot(aes(x = LimitedActivity , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion") +
xlab("Limited activity due to health problems") +
ggtitle("Limited Activity due to Health Problems") +
theme_bw() +
scale_fill_manual(values=my_colors2) +
theme(legend.position = "none")
limact_p
table(df_final$wellbeing, df_final$Diabetes)
##
## Diabetic Non Diabetic
## Good_Wellbeing 30024 230859
## Poor_Wellbeing 2728 11992
rx.p1 <- df_final %>%
ggplot(aes(x = Diabetes, fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
rx.p2 <- df_final %>%
ggplot(aes(x = Diabetes, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
library(patchwork)
rx.p1 + rx.p2
There seems to be a visual
difference in the percentages of the Poor and Good Wellbeing between
Diabetic and Non Diabetic individuals.
table(df_final$wellbeing, df_final$employ)
##
## Homemaker Retired Self-Employed Student Unable to work
## Good_Wellbeing 17872 72112 22313 3822 13343
## Poor_Wellbeing 577 2411 805 229 4164
##
## Unemployed for less than 1 yr Unemployed for more than 1 yr
## Good_Wellbeing 6786 7285
## Poor_Wellbeing 1070 1529
##
## Waged Employement
## Good_Wellbeing 117350
## Poor_Wellbeing 3935
df_final %>%
ggplot(aes(x = employ, fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
df_final %>%
ggplot(aes(x = employ, fill = wellbeing)) +
geom_bar(position = "fill") +
xlab("employment status") +
ylab("proportion") +
ggtitle("Wellbeing status by employment status") +
theme_bw() +
scale_x_discrete(labels = wrap_format(10)) +
scale_fill_manual(values=my_colors2) +
theme(legend.position = "top")
p_empa <- df_final %>%
ggplot(aes(x = employ, fill = wellbeing)) +
geom_bar(position = "fill") +
xlab("employment status") +
ylab("proportion") +
ggtitle("Wellbeing status by employment status, group by age") +
facet_wrap(~Age_group, ncol=2) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust=1, hjust=1)) +
scale_fill_manual(values=my_colors2) +
theme(legend.position = "top")
p_empa
–> There seems to be
minimal visual difference in the percentages of the Poor and Good
Wellbeing for Homemaker, Retired and Waged Employment
–>
There is relatively higher proportions of Poor Wellbeing in students
than the prior-mentioned employment groups.
–> Individuals who
are unable to work or are unemployed (either more than a year or less
than a year) have significantly higher proportions of Poor Wellbeing
than any other employment status groups.
rx.p1 <- df_final %>%
ggplot(aes(x = HealthInsurance, fill = wellbeing)) +
geom_bar() +
ylab("number of individuals") +
theme_bw() +
scale_fill_manual(values=my_colors2) +
theme(legend.position = "none")
rx.p2 <- df_final %>%
ggplot(aes(x = HealthInsurance, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion") +
theme_bw() +
scale_fill_manual(values=my_colors2)
library(patchwork)
rx.p1 + rx.p2 + ggtitle("Wellbeing status by health insurance")
table(df_final$wellbeing, df_final$BMI)
##
## Healthy weight Obese Overweight
## Good_Wellbeing 89792 73957 97134
## Poor_Wellbeing 4544 5651 4525
df_final %>%
ggplot(aes(x = BMI, fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
df_final %>%
ggplot(aes(x = BMI, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
Individuals with higher
BMI tend to have slightly higher proportions of “POOR” wellbeing.
table(df_final$PoorSleepDays, df_final$wellbeing)
##
## Good_Wellbeing Poor_Wellbeing
## 0 93145 2527
## 1 8051 164
## 2 21461 487
## 3 14905 416
## 4 9838 322
## 5 20899 733
## 6 3987 180
## 7 5813 308
## 8 2415 108
## 9 251 17
## 10 17910 1006
## 11 61 9
## 12 1624 115
## 13 88 8
## 14 1728 160
## 15 15792 1396
## 16 233 34
## 17 137 24
## 18 269 37
## 19 34 5
## 20 11946 1339
## 21 408 64
## 22 186 34
## 23 79 21
## 24 180 26
## 25 4405 632
## 26 240 36
## 27 294 40
## 28 1418 201
## 29 641 83
## 30 22445 4188
df_final %>%
ggplot(aes(x = PoorSleepDays , fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
df_final %>%
ggplot(aes(x = PoorSleepDays , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
It could be seen visually that
with increment in the Poor Sleep days there is a significant
increase in the Poor Wellbeing of indivduals.
table(df_final$PhysicalActivity, df_final$wellbeing)
##
## Good_Wellbeing Poor_Wellbeing
## Not Physically Active 61658 6683
## Physically Active 199225 8037
x1 = df_final %>%
ggplot(aes(x = PhysicalActivity , fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
x2 =df_final %>%
ggplot(aes(x = PhysicalActivity , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
library(patchwork)
x1+x2
It could be confirmed visually
that with Low Physical Activitys there is a significant increase in
proportion of the Poor Wellbeing of individuals
table(df_final$wellbeing, df_final$CurrentSmoker)
##
## Current Smoker Not a Current Smoker
## Good_Wellbeing 38848 222035
## Poor_Wellbeing 5240 9480
rx.p1 <- df_final %>%
ggplot(aes(x = CurrentSmoker, fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
rx.p2 <- df_final %>%
ggplot(aes(x = CurrentSmoker, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
library(patchwork)
rx.p1 + rx.p2
From the Visual Evidence, it
could be said that the Smoking Status affect the Wellbeing of an
individual. It could interpreted from the graph that more number of
current smokers possess poor wellbeing as compared to those who are not
a current smoker.
table(df_final$wellbeing, df_final$HeavyDrinker)
##
## Heavy Drinker Not a Heavy Drinker
## Good_Wellbeing 13172 247711
## Poor_Wellbeing 959 13761
rx.p1 <- df_final %>%
ggplot(aes(x = HeavyDrinker, fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
rx.p2 <- df_final %>%
ggplot(aes(x = HeavyDrinker, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
library(patchwork)
rx.p1 + rx.p2
There seems to be minimal
visual difference in the percentages of the Poor and Good Wellbeing for
Heavy Drinker and not a Heavy Drinker.
table(df_final$PoorPhysicalHealthDays, df_final$wellbeing)
##
## Good_Wellbeing Poor_Wellbeing
## 0 169695 4619
## 1 12060 417
## 2 15411 733
## 3 8575 544
## 4 4536 291
## 5 7890 551
## 6 1244 113
## 7 4433 351
## 8 775 61
## 9 142 22
## 10 5668 561
## 11 48 6
## 12 508 79
## 13 58 6
## 14 2460 216
## 15 4750 708
## 16 109 35
## 17 68 21
## 18 132 33
## 19 20 8
## 20 2962 591
## 21 573 70
## 22 67 13
## 23 44 9
## 24 51 17
## 25 1193 294
## 26 64 7
## 27 92 25
## 28 435 128
## 29 172 71
## 30 16648 4120
df_final %>%
ggplot(aes(x = PoorPhysicalHealthDays , fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
df_final %>%
ggplot(aes(x = PoorPhysicalHealthDays , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
It could be seen visually that
with increment in the Poor Physical Health days there is a
significant increase in the Poor Wellbeing of indivduals
table(df_final$PoorMentalHealthDays, df_final$wellbeing)
##
## Good_Wellbeing Poor_Wellbeing
## 0 183277 2907
## 1 9195 229
## 2 14268 550
## 3 7948 397
## 4 3895 277
## 5 9611 706
## 6 914 110
## 7 3268 342
## 8 700 78
## 9 88 19
## 10 6480 922
## 11 21 6
## 12 368 77
## 13 36 10
## 14 1191 197
## 15 5143 1151
## 16 60 16
## 17 44 17
## 18 62 28
## 19 9 3
## 20 2953 995
## 21 217 70
## 22 35 21
## 23 22 4
## 24 39 10
## 25 976 420
## 26 33 10
## 27 66 34
## 28 285 121
## 29 188 70
## 30 9491 4923
df_final %>%
ggplot(aes(x = PoorMentalHealthDays , fill = wellbeing)) +
geom_bar() +
theme(legend.position = "none")
df_final %>%
ggplot(aes(x = PoorMentalHealthDays , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
It could be seen visually that
with increment in the Poor Mental Health days there is a significant
increase in the Poor Wellbeing of indivduals.
df_final %>%
ggplot(aes(x = PoorMentalHealthDays , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion") +
facet_wrap(~HeavyDrinker) +
theme_bw() +
scale_fill_manual(values=my_colors2) +
theme(legend.position = "bottom")
r1 = df_final %>%
ggplot(aes(x = PoorPhysicalHealthDays , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
r2 = df_final %>%
ggplot(aes(x = PoorMentalHealthDays , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion")
library(patchwork)
r1+r2
#### Comparing Poor Mental, Physical and Poor Sleep Days
poor_mh_p <- df_final %>%
ggplot(aes(x = PoorMentalHealthDays , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion") +
xlab("Number of poor mental health days") +
ggtitle("Poor Mental Health Days") +
scale_x_discrete(labels = wrap_format(10)) +
theme_bw() +
scale_fill_manual(values=my_colors2) +
theme(legend.position = "none")
poor_ph_p <- df_final %>%
ggplot(aes(x = PoorPhysicalHealthDays , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion") +
xlab("Number of poor physical health days") +
ggtitle("Poor Physical Health Days") +
scale_x_discrete(labels = wrap_format(10)) +
theme_bw() +
scale_fill_manual(values=my_colors2) +
theme(legend.position = "none")
poor_s_p <- df_final %>%
ggplot(aes(x = PoorSleepDays , fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion") +
xlab("Number of poor sleep days") +
scale_x_discrete(labels = wrap_format(10)) +
ggtitle("Poor Sleep Days") +
theme_bw() +
scale_fill_manual(values=my_colors2) +
theme(legend.position = "none") +
theme(legend.position = "right")
poor_mh_p + poor_ph_p + poor_s_p
It is visually prominent that in both cases whenevr there is increase
in the number of Poor Physical Health days or Poor Mental Health Days
there seems to be almost linear increase in the Poor Wellbeing of the
individuals
This also coduld be deduced from the graph that there is a clear association between Poor Health Days (Mental or Physical) and the Wellbeing of an indivdual.
The more interesting part to notice in here is
that The increment in
proportions of “Poor Wellbeing” in the Poor Mental Health days is quite
more there is a significant increase in the Poor Wellbeing of
indivduals.
df_final %>% ggplot(aes(x=PoorPhysicalHealthDays,y=PoorMentalHealthDays,colour =wellbeing)) + geom_point() + geom_abline(intercept=30, slope=-1)
Visual Representation rejects my hypothesis that there is any correlation between the Poor Physical and Poor Mental heath days.
1. No presence of correlation between the Poor Mental and
Poor Health Days
2. If the Graph is colored based on the Well being
of an individual some thing very important starts to pop up :
We can visually bifurcate the presence of “Poor Wellbeing and Good
Wellbeing” with a line y=-x where the upper half denotes More Poor
Physical and More Poor Mental Health days where individauls have Poor
Wellbeing
3. The sparse distribution of points in the upper
half of the line signifies relatively lesser records of individuals who
have higher poor mental or physical health days.
df_final %>% ggplot(aes(x=SomeCollegeRate,y=HighSchoolRate,colour = wellbeing)) + geom_point() + facet_grid(~wellbeing)
There is a clear presence of correlation between the two variables. That indicates that with increase in the College Rate there is parallel increment in School Rate specific to the county.
It is also evident that the distribution more or less remains the same for both wellbeings.
We could reject the possibility of interaction on wellbeing on High School and Some College Rates
df_final %>% ggplot(aes(x=PM2.5,y=PopulationDensity,colour = wellbeing)) + geom_point() + facet_wrap(~wellbeing) +ylim(0,15000)
## Warning: Removed 919 rows containing missing values (`geom_point()`).
There is a clear presence of correlation between the two variables. That indicates with increase in the Population Density there is increment in the PM2.5 specific to the county.
It is also evident that the distribution more or less remains the same for both wellbeings.
We could reject the possibility of interaction on wellbeing on High School and Some College Rates
df_final %>% ggplot(aes(x=SomeCollegeRate,y=HighSchoolRate,colour = wellbeing)) + geom_point() + facet_grid(HeavyDrinker~wellbeing)
# library(corrplot)
# library(sjPlot)
rr_county <- df_final[,c(2,22:33)]
rr_cm <- as.matrix(rr_county)
corr_m <- cor(rr_cm, use="pairwise.complete.obs")
# correlation matrix table
tab_corr(rr_county,
na.deletion="pairwise",
corr.method="pearson",
title="Correlations with pair-wise NA deletion",
show.p=TRUE)
| age | PrematureMortality | HouseholdIncome | ParkAccess | CrimeRate | UnemploymentRate | WaterSafety | HighSchoolRate | SomeCollegeRate | AccessToRecFacility | FastFoodPercentage | PM2.5 | PopulationDensity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| age | 0.023*** | -0.056*** | -0.035*** | -0.006** | 0.021*** | 0.005* | -0.015*** | -0.038*** | -0.002 | -0.052*** | -0.016*** | -0.014*** | |
| PrematureMortality | 0.023*** | -0.681*** | -0.308*** | 0.398*** | 0.336*** | 0.103*** | -0.402*** | -0.609*** | -0.445*** | 0.276*** | 0.208*** | -0.043*** | |
| HouseholdIncome | -0.056*** | -0.681*** | 0.361*** | -0.211*** | -0.361*** | -0.093*** | 0.366*** | 0.660*** | 0.480*** | -0.034*** | -0.093*** | 0.141*** | |
| ParkAccess | -0.035*** | -0.308*** | 0.361*** | 0.290*** | -0.109*** | -0.137*** | -0.115*** | 0.486*** | 0.220*** | 0.176*** | -0.130*** | 0.373*** | |
| CrimeRate | -0.006** | 0.398*** | -0.211*** | 0.290*** | 0.268*** | -0.092*** | -0.523*** | -0.067*** | -0.122*** | 0.262*** | 0.001 | 0.273*** | |
| UnemploymentRate | 0.021*** | 0.336*** | -0.361*** | -0.109*** | 0.268*** | -0.048*** | -0.329*** | -0.427*** | -0.311*** | 0.043*** | -0.103*** | 0.019*** | |
| WaterSafety | 0.005* | 0.103*** | -0.093*** | -0.137*** | -0.092*** | -0.048*** | 0.048*** | -0.128*** | -0.064*** | 0.021*** | 0.004* | -0.071*** | |
| HighSchoolRate | -0.015*** | -0.402*** | 0.366*** | -0.115*** | -0.523*** | -0.329*** | 0.048*** | 0.242*** | 0.248*** | -0.157*** | -0.071*** | -0.168*** | |
| SomeCollegeRate | -0.038*** | -0.609*** | 0.660*** | 0.486*** | -0.067*** | -0.427*** | -0.128*** | 0.242*** | 0.530*** | -0.023*** | 0.004* | 0.200*** | |
| AccessToRecFacility | -0.002 | -0.445*** | 0.480*** | 0.220*** | -0.122*** | -0.311*** | -0.064*** | 0.248*** | 0.530*** | -0.262*** | -0.009*** | 0.182*** | |
| FastFoodPercentage | -0.052*** | 0.276*** | -0.034*** | 0.176*** | 0.262*** | 0.043*** | 0.021*** | -0.157*** | -0.023*** | -0.262*** | 0.145*** | -0.011*** | |
| PM2.5 | -0.016*** | 0.208*** | -0.093*** | -0.130*** | 0.001 | -0.103*** | 0.004* | -0.071*** | 0.004* | -0.009*** | 0.145*** | -0.020*** | |
| PopulationDensity | -0.014*** | -0.043*** | 0.141*** | 0.373*** | 0.273*** | 0.019*** | -0.071*** | -0.168*** | 0.200*** | 0.182*** | -0.011*** | -0.020*** | |
| Computed correlation used pearson-method with pairwise-deletion. | |||||||||||||
# correlogram
testRes <- cor.mtest(rr_cm, conf.level = 0.95)
corrplot.mixed(corr_m,
upper = "ellipse",
p.mat = testRes$p,
title="Correlation Matrix of Variables of Interest"
)
### Highly Correlated Groups
pairs.panels(df_final[, c(23:24,26,29:31)], scale = TRUE, ellipses = FALSE, pch = ".", lm = TRUE, rug = FALSE, stars = TRUE)
# density plots
premort_wb_plot <- df_final %>%
ggdensity( x = "PrematureMortality" ,
add = "mean", rug = FALSE,
color = "wellbeing", fill = "wellbeing", palette=my_colors, alpha=0.25)
hincome_wb_plot <- df_final %>%
ggdensity( x = "HouseholdIncome" ,
add = "mean", rug = FALSE,
color = "wellbeing", fill = "wellbeing", palette=my_colors, alpha=0.25) +
theme(legend.position="none")
park_wb_plot <- df_final %>%
ggdensity( x = "ParkAccess" ,
add = "mean", rug = FALSE,
color = "wellbeing", fill = "wellbeing", palette=my_colors, alpha=0.25) +
theme(legend.position="none")
crime_wb_plot <- df_final %>%
ggdensity( x = "CrimeRate" ,
add = "mean", rug = FALSE,
color = "wellbeing", fill = "wellbeing", palette=my_colors, alpha=0.25) +
theme(legend.position="none")
unemp_wb_plot <- df_final %>%
ggdensity( x = "UnemploymentRate" ,
add = "mean", rug = FALSE,
color = "wellbeing", fill = "wellbeing", palette=my_colors, alpha=0.25)
water_wb_plot <- df_final %>%
ggdensity( x = "WaterSafety" ,
add = "mean", rug = FALSE,
color = "wellbeing", fill = "wellbeing", palette=my_colors, alpha=0.25) +
theme(legend.position="none")
rec_wb_plot <- df_final %>%
ggdensity( x = "AccessToRecFacility" ,
add = "mean", rug = FALSE,
color = "wellbeing", fill = "wellbeing", palette=my_colors, alpha=0.25) +
theme(legend.position="none")
fastfood_wb_plot <- df_final %>%
ggdensity( x = "FastFoodPercentage" ,
add = "mean", rug = FALSE,
color = "wellbeing", fill = "wellbeing", palette=my_colors, alpha=0.25) +
theme(legend.position="none")
# premature mortality
premort_wb_plot + ggtitle("Density plot of county premature mortality by wellbeing") + theme(legend.position="right") + theme_bw()
df_final %>%
ggplot(aes(x=wellbeing, y=PrematureMortality, fill=wellbeing)) +
geom_violin(alpha=0.75) +
theme_bw() +
ggtitle("Violin Plot of Premature Mortality by Wellbeing Status") +
scale_fill_manual(values=my_colors2)
# county level factors won't changed based on wellbeing, which is individual
df_final %>%
ggplot(aes(x=HouseholdIncome, y=PrematureMortality, group=wellbeing, color=wellbeing)) +
geom_point(shape=1,alpha=0.5) +
theme_bw() +
stat_smooth(method="lm", formula=y~x, geom="smooth", se=TRUE) +
ggtitle("Scatter Plot of County-Level Mean Household Income and\nNumber of Premature Deaths per 100,000") +
facet_grid(~wellbeing)
df_final %>%
ggplot(aes(x=HouseholdIncome, y=PrematureMortality, group=wellbeing, color=wellbeing)) +
geom_point(alpha=0) +
theme_bw() +
stat_smooth(method="lm", formula=y~x, geom="smooth", se=TRUE) +
ggtitle("Trendlines of County-Level Mean Household Income and\nNumber of Premature Deaths per 100,000")
Counties with higher mean household incomes tended to have lower premature mortality rates. This trend appears consistent across individuals who reported having good wellbeing and those that reported having poor wellbeing.
df_final %>%
ggplot(aes(x=income, y=PrematureMortality, group=income, color=income)) +
geom_jitter(shape=1,alpha=0.5) +
theme_bw() +
ggtitle("Jitter Plot of Individual Income Group and\nNumber of Premature Deaths per 100,000") +
facet_grid(~wellbeing)
df_final %>%
ggplot(aes(x=income, y=PrematureMortality, group=income, color=income)) +
geom_boxplot() +
theme_bw() +
ggtitle("Box Plot of Individual Income Group and\nNumber of Premature Deaths per 100,000") +
facet_grid(~wellbeing)
hincome_wb_plot + ggtitle("Density plot of county household income by wellbeing") + theme(legend.position="right") + theme_bw()
df_final %>%
ggplot(aes(x=HouseholdIncome, group=wellbeing, fill=wellbeing)) +
geom_density() +
facet_wrap(~wellbeing, ncol=1) +
theme_bw() +
theme(legend.position = "bottom") +
scale_fill_manual(values=my_colors2)
df_final %>%
ggplot(aes(x=income, y=HouseholdIncome, group=income, color=income)) +
geom_jitter(shape=1,alpha=0.5) +
theme_bw() +
ggtitle("Jitter Plot of County-Level Mean Household Income and\nIndividual Income Group") +
facet_grid(~wellbeing)
df_final %>%
ggplot(aes(x=income, y=HouseholdIncome, group=income, color=income)) +
geom_boxplot() +
theme_bw() +
ggtitle("Box Plot of County-Level Mean Household Income and\nInividual Income Group") +
facet_grid(~wellbeing)
Among those with good wellbeing, there appear to be more indviduals living in counties with high household incomes.
# bin mean household income to match individual income bins
df_final <- df_final %>%
mutate(HouseholdIncome.bin = cut(HouseholdIncome, breaks=c(0, 24999, 34999, 49999, 119525)))
df_final$HouseholdIncome.bin <- recode_factor(df_final$HouseholdIncome.bin, "(0,2.5e+04]" = "less than 25000", "(2.5e+04,3.5e+04]" = "[25000,35000)", "(3.5e+04,5e+04]" = "[35000,50000)","(5e+04,1.2e+05]" = "50000 or more")
frq(df_final$HouseholdIncome.bin)
## x <categorical>
## # total N=275603 valid N=275603 mean=3.40 sd=0.61
##
## Value | N | Raw % | Valid % | Cum. %
## ---------------------------------------------------
## less than 25000 | 214 | 0.08 | 0.08 | 0.08
## [25000,35000) | 17292 | 6.27 | 6.27 | 6.35
## [35000,50000) | 131487 | 47.71 | 47.71 | 54.06
## 50000 or more | 126610 | 45.94 | 45.94 | 100.00
## <NA> | 0 | 0.00 | <NA> | <NA>
# create equal sized bins
library(funModeling)
df_final$HouseholdIncome.bin.equal <- equal_freq(var=df_final$HouseholdIncome, n_bins=5)
frq(df_final$HouseholdIncome.bin.equal)
## x <categorical>
## # total N=275603 valid N=275603 mean=2.99 sd=1.41
##
## Value | N | Raw % | Valid % | Cum. %
## -------------------------------------------------
## [23751, 40704) | 55467 | 20.13 | 20.13 | 20.13
## [40704, 46233) | 54919 | 19.93 | 19.93 | 40.05
## [46233, 51106) | 55490 | 20.13 | 20.13 | 60.19
## [51106, 60397) | 55150 | 20.01 | 20.01 | 80.20
## [60397,119525] | 54577 | 19.80 | 19.80 | 100.00
## <NA> | 0 | 0.00 | <NA> | <NA>
tab_xtab(df_final$HouseholdIncome.bin, df_final$wellbeing, show.col.prc = TRUE, show.exp = TRUE)
| HouseholdIncome.bin | wellbeing | Total | |
|---|---|---|---|
| Good_Wellbeing | Poor_Wellbeing | ||
| less than 25000 |
203 203 0.1 % |
11 11 0.1 % |
214 214 0.1 % |
| [25000,35000) |
16170 16368 6.2 % |
1122 924 7.6 % |
17292 17292 6.3 % |
| [35000,50000) |
123944 124464 47.5 % |
7543 7023 51.2 % |
131487 131487 47.7 % |
| 50000 or more |
120566 119848 46.2 % |
6044 6762 41.1 % |
126610 126610 45.9 % |
| Total |
260883 260883 100 % |
14720 14720 100 % |
275603 275603 100 % |
χ2=166.368 · df=3 · Cramer’s V=0.025 · p=0.000 |
# compare againt individual income
df_final %>%
group_by(HouseholdIncome.bin) %>%
ggplot(aes(x = income, fill = wellbeing)) +
geom_bar(position = "fill") +
ylab("proportion") +
facet_wrap(~HouseholdIncome.bin) +
theme(axis.text.x = element_text(angle = 45, vjust=1, hjust=1))
park_wb_plot + ggtitle("Density plot of county park access by wellbeing") + theme(legend.position="right") + theme_bw()
crime_wb_plot + ggtitle("Density plot of county crime rate by wellbeing") + theme(legend.position="right") + theme_bw()
unemp_wb_plot + ggtitle("Density plot of county unemployment rate by wellbeing") + theme(legend.position="right") + theme_bw()
water_wb_plot + ggtitle("Density plot of Water Safety Rate by wellbeing") + theme(legend.position="right") + theme_bw()
waterlog_wb_plot <- df_final %>%
mutate(WaterSafetyLog = log(WaterSafety,10)) %>%
filter(WaterSafetyLog >= 0) %>%
ggdensity( x = "WaterSafetyLog" ,
color = "wellbeing", fill = "wellbeing", alpha=0.25) +
theme(legend.position="right")
waterlog_wb_plot
# box plots
df_final %>%
ggplot(aes(x=WaterSafety, group=wellbeing, fill=wellbeing)) +
geom_boxplot() +
facet_wrap(~wellbeing, ncol=1) +
theme_bw() +
scale_fill_manual(values=my_colors2)
rec_wb_plot + ggtitle("Density plot of county recreational facility access by wellbeing") + theme(legend.position="right") + theme_bw()
fastfood_wb_plot + ggtitle("Density plot of county fast food restaurant percentage\nby wellbeing") + theme(legend.position="right") + theme_bw()
# BMI
df_final %>%
ggplot(aes(x=FastFoodPercentage, group=BMI, fill=BMI)) +
geom_density() +
facet_wrap(~BMI) +
theme_bw()
# BMI
df_final %>%
ggplot(aes(x=FastFoodPercentage, group=CVD, fill=CVD)) +
geom_density() +
facet_wrap(~CVD) +
theme_bw()
# BMI
df_final %>%
ggplot(aes(x=FastFoodPercentage, group=GeneralHealth, fill=GeneralHealth)) +
geom_density() +
facet_wrap(~GeneralHealth) +
theme_bw()
# BMI
df_final %>%
ggplot(aes(x=FastFoodPercentage, group=Diabetes, fill=Diabetes)) +
geom_density() +
facet_wrap(~Diabetes) +
theme_bw()
multiplot(premort_wb_plot, hincome_wb_plot, park_wb_plot, crime_wb_plot, unemp_wb_plot, water_wb_plot, rec_wb_plot, fastfood_wb_plot, plotlist=NULL, cols=2)
# set up variables
dependent <- "wellbeing"
pred.all <- c("age","SocialSupport","race","income","GeneralHealth","PoorPhysicalHealthDays","PoorMentalHealthDays","Asthma","HealthInsurance","CVD","LimitedActivity","Diabetes","employ","BMI","HeavyDrinker","CurrentSmoker","PoorSleepDays","PhysicalActivity","marital","PrematureMortality","HouseholdIncome","ParkAccess","CrimeRate","UnemploymentRate","WaterSafety","HighSchoolRate","SomeCollegeRate","AccessToRecFacility","FastFoodPercentage","PM2.5","PopulationDensity")
# initial model
model.all <- glm(wellbeing ~
age+
SocialSupport+
race+
income+
GeneralHealth+
PoorPhysicalHealthDays+
PoorMentalHealthDays+
Asthma+
HealthInsurance+
CVD+
LimitedActivity+
Diabetes+
employ+
BMI+
HeavyDrinker+
CurrentSmoker+
PoorSleepDays+
PhysicalActivity+
marital+
PrematureMortality+
HouseholdIncome+
ParkAccess+
CrimeRate+
UnemploymentRate+
WaterSafety+
HighSchoolRate+
SomeCollegeRate+
AccessToRecFacility+
FastFoodPercentage+
PM2.5+
PopulationDensity,
family=binomial,
data=df_final)
summary(model.all)
##
## Call:
## glm(formula = wellbeing ~ age + SocialSupport + race + income +
## GeneralHealth + PoorPhysicalHealthDays + PoorMentalHealthDays +
## Asthma + HealthInsurance + CVD + LimitedActivity + Diabetes +
## employ + BMI + HeavyDrinker + CurrentSmoker + PoorSleepDays +
## PhysicalActivity + marital + PrematureMortality + HouseholdIncome +
## ParkAccess + CrimeRate + UnemploymentRate + WaterSafety +
## HighSchoolRate + SomeCollegeRate + AccessToRecFacility +
## FastFoodPercentage + PM2.5 + PopulationDensity, family = binomial,
## data = df_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5156 -0.2318 -0.1417 -0.1015 3.6273
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.137e+00 2.325e-01 -9.194 < 2e-16 ***
## age -4.217e-03 9.191e-04 -4.588 4.48e-06 ***
## SocialSupport Sometimes -8.367e-01 3.217e-02 -26.010 < 2e-16 ***
## SocialSupportAlways -2.672e+00 3.621e-02 -73.806 < 2e-16 ***
## SocialSupportNever -7.592e-01 4.080e-02 -18.607 < 2e-16 ***
## SocialSupportUsually -2.023e+00 3.455e-02 -58.559 < 2e-16 ***
## raceWhite 3.346e-01 2.685e-02 12.460 < 2e-16 ***
## income[25000-35000) -5.318e-02 3.509e-02 -1.516 0.129613
## income[35000-50000) -1.069e-01 3.579e-02 -2.986 0.002825 **
## income50000 or more -3.181e-01 3.415e-02 -9.314 < 2e-16 ***
## incomeless than 15000 1.992e-02 2.923e-02 0.682 0.495539
## GeneralHealthFair 6.526e-01 4.807e-02 13.574 < 2e-16 ***
## GeneralHealthGood 3.519e-01 4.399e-02 8.001 1.23e-15 ***
## GeneralHealthPoor 1.095e+00 5.525e-02 19.829 < 2e-16 ***
## GeneralHealthVery good 1.928e-01 4.481e-02 4.302 1.70e-05 ***
## PoorPhysicalHealthDays 2.667e-03 1.195e-03 2.232 0.025602 *
## PoorMentalHealthDays 6.146e-02 9.248e-04 66.456 < 2e-16 ***
## AsthmaNo Asthma 4.050e-02 2.912e-02 1.391 0.164245
## HealthInsuranceWith Health Coverage -2.387e-01 2.894e-02 -8.248 < 2e-16 ***
## CVDNo Cardiovascular Disease 4.161e-02 3.596e-02 1.157 0.247193
## LimitedActivityNot Limited -4.960e-01 2.502e-02 -19.826 < 2e-16 ***
## DiabetesNon Diabetic 2.108e-03 2.899e-02 0.073 0.942033
## employRetired -1.379e-02 5.539e-02 -0.249 0.803426
## employSelf-Employed 2.665e-01 6.257e-02 4.259 2.06e-05 ***
## employStudent 2.313e-01 9.316e-02 2.483 0.013040 *
## employUnable to work 4.559e-01 5.471e-02 8.332 < 2e-16 ***
## employUnemployed for less than 1 yr 9.543e-01 6.289e-02 15.173 < 2e-16 ***
## employUnemployed for more than 1 yr 9.454e-01 5.964e-02 15.853 < 2e-16 ***
## employWaged Employement 1.724e-01 5.174e-02 3.333 0.000861 ***
## BMIObese -1.470e-02 2.624e-02 -0.560 0.575193
## BMIOverweight -7.499e-02 2.572e-02 -2.915 0.003552 **
## HeavyDrinkerNot a Heavy Drinker -2.807e-01 4.224e-02 -6.646 3.01e-11 ***
## CurrentSmokerNot a Current Smoker -2.295e-01 2.369e-02 -9.688 < 2e-16 ***
## PoorSleepDays 1.686e-02 9.536e-04 17.682 < 2e-16 ***
## PhysicalActivityPhysically Active -1.364e-01 2.205e-02 -6.184 6.23e-10 ***
## maritalNot married 4.786e-01 2.352e-02 20.346 < 2e-16 ***
## PrematureMortality -5.297e-04 2.043e-04 -2.592 0.009539 **
## HouseholdIncome 1.840e-07 1.324e-06 0.139 0.889466
## ParkAccess 2.148e-03 5.709e-04 3.763 0.000168 ***
## CrimeRate 9.953e-05 5.096e-05 1.953 0.050818 .
## UnemploymentRate 7.559e-03 4.678e-03 1.616 0.106126
## WaterSafety -2.522e-03 7.138e-04 -3.533 0.000410 ***
## HighSchoolRate -2.340e-04 1.419e-03 -0.165 0.868981
## SomeCollegeRate 7.855e-03 1.536e-03 5.115 3.13e-07 ***
## AccessToRecFacility -3.286e-03 2.662e-03 -1.234 0.217029
## FastFoodPercentage -1.703e-03 1.405e-03 -1.212 0.225588
## PM2.5 4.086e-03 7.346e-03 0.556 0.578058
## PopulationDensity 4.443e-06 2.577e-06 1.724 0.084624 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 114891 on 275602 degrees of freedom
## Residual deviance: 73633 on 275555 degrees of freedom
## AIC: 73729
##
## Number of Fisher Scoring iterations: 7
# library(corrplot)
# library(sjPlot)
rr_county <- df_final[,c(2,22:33)]
rr_cm <- as.matrix(rr_county)
corr_m <- cor(rr_cm, use="pairwise.complete.obs")
# correlation matrix table
tab_corr(rr_county,
na.deletion="pairwise",
corr.method="pearson",
title="Correlations with pair-wise NA deletion",
show.p=TRUE)
| age | PrematureMortality | HouseholdIncome | ParkAccess | CrimeRate | UnemploymentRate | WaterSafety | HighSchoolRate | SomeCollegeRate | AccessToRecFacility | FastFoodPercentage | PM2.5 | PopulationDensity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| age | 0.023*** | -0.056*** | -0.035*** | -0.006** | 0.021*** | 0.005* | -0.015*** | -0.038*** | -0.002 | -0.052*** | -0.016*** | -0.014*** | |
| PrematureMortality | 0.023*** | -0.681*** | -0.308*** | 0.398*** | 0.336*** | 0.103*** | -0.402*** | -0.609*** | -0.445*** | 0.276*** | 0.208*** | -0.043*** | |
| HouseholdIncome | -0.056*** | -0.681*** | 0.361*** | -0.211*** | -0.361*** | -0.093*** | 0.366*** | 0.660*** | 0.480*** | -0.034*** | -0.093*** | 0.141*** | |
| ParkAccess | -0.035*** | -0.308*** | 0.361*** | 0.290*** | -0.109*** | -0.137*** | -0.115*** | 0.486*** | 0.220*** | 0.176*** | -0.130*** | 0.373*** | |
| CrimeRate | -0.006** | 0.398*** | -0.211*** | 0.290*** | 0.268*** | -0.092*** | -0.523*** | -0.067*** | -0.122*** | 0.262*** | 0.001 | 0.273*** | |
| UnemploymentRate | 0.021*** | 0.336*** | -0.361*** | -0.109*** | 0.268*** | -0.048*** | -0.329*** | -0.427*** | -0.311*** | 0.043*** | -0.103*** | 0.019*** | |
| WaterSafety | 0.005* | 0.103*** | -0.093*** | -0.137*** | -0.092*** | -0.048*** | 0.048*** | -0.128*** | -0.064*** | 0.021*** | 0.004* | -0.071*** | |
| HighSchoolRate | -0.015*** | -0.402*** | 0.366*** | -0.115*** | -0.523*** | -0.329*** | 0.048*** | 0.242*** | 0.248*** | -0.157*** | -0.071*** | -0.168*** | |
| SomeCollegeRate | -0.038*** | -0.609*** | 0.660*** | 0.486*** | -0.067*** | -0.427*** | -0.128*** | 0.242*** | 0.530*** | -0.023*** | 0.004* | 0.200*** | |
| AccessToRecFacility | -0.002 | -0.445*** | 0.480*** | 0.220*** | -0.122*** | -0.311*** | -0.064*** | 0.248*** | 0.530*** | -0.262*** | -0.009*** | 0.182*** | |
| FastFoodPercentage | -0.052*** | 0.276*** | -0.034*** | 0.176*** | 0.262*** | 0.043*** | 0.021*** | -0.157*** | -0.023*** | -0.262*** | 0.145*** | -0.011*** | |
| PM2.5 | -0.016*** | 0.208*** | -0.093*** | -0.130*** | 0.001 | -0.103*** | 0.004* | -0.071*** | 0.004* | -0.009*** | 0.145*** | -0.020*** | |
| PopulationDensity | -0.014*** | -0.043*** | 0.141*** | 0.373*** | 0.273*** | 0.019*** | -0.071*** | -0.168*** | 0.200*** | 0.182*** | -0.011*** | -0.020*** | |
| Computed correlation used pearson-method with pairwise-deletion. | |||||||||||||
# correlogram
testRes <- cor.mtest(rr_cm, conf.level = 0.95)
corrplot.mixed(corr_m,
upper = "ellipse",
p.mat = testRes$p,
title="Correlation Matrix of Variables of Interest",
tl.pos = "lt")
##### Highly Correlated Groups
pairs.panels(df_final[, c(23:24,26,29:31)], scale = TRUE, ellipses = FALSE, pch = ".", lm = TRUE, rug = FALSE, stars = TRUE)
rr_county_sig <- df_final[,c(2,22,24,25,26,27,29)]
rr_cm_sig <- as.matrix(rr_county_sig)
corr_m_sig <- cor(rr_cm_sig, use="pairwise.complete.obs")
# correlogram
testRes_sig <- cor.mtest(rr_cm_sig, conf.level = 0.95)
corrplot.mixed(corr_m_sig,
upper = "ellipse",
p.mat = testRes_sig$p,
tl.pos = "lt")
library(car)
library(finalfit)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:patchwork':
##
## area
## The following object is masked from 'package:dplyr':
##
## select
vif_all <- vif(model.all)
vif_all <- as.data.frame(vif_all)
#create horizontal bar chart to display each VIF value
vif_plot_all <- vif_all %>%
ggplot(aes(x=rownames(vif_all), y=GVIF)) +
geom_col(color="#434343", fill="#6BD6DB") +
xlab("All Predictors") +
ylab("VIF Values") +
ggtitle("VIF Values for All Predictors") +
theme_bw() +
geom_hline(aes(yintercept=5),color="#434343", linetype="dashed", linewidth=0.5) +
coord_flip()
vif_plot_all
step <- stepAIC(model.all, direction="both") # Stepwise function
## Start: AIC=73729.1
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth +
## PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma +
## HealthInsurance + CVD + LimitedActivity + Diabetes + employ +
## BMI + HeavyDrinker + CurrentSmoker + PoorSleepDays + PhysicalActivity +
## marital + PrematureMortality + HouseholdIncome + ParkAccess +
## CrimeRate + UnemploymentRate + WaterSafety + HighSchoolRate +
## SomeCollegeRate + AccessToRecFacility + FastFoodPercentage +
## PM2.5 + PopulationDensity
##
## Df Deviance AIC
## - Diabetes 1 73633 73727
## - HouseholdIncome 1 73633 73727
## - HighSchoolRate 1 73633 73727
## - PM2.5 1 73633 73727
## - CVD 1 73634 73728
## - FastFoodPercentage 1 73635 73729
## - AccessToRecFacility 1 73635 73729
## - Asthma 1 73635 73729
## <none> 73633 73729
## - UnemploymentRate 1 73636 73730
## - PopulationDensity 1 73636 73730
## - CrimeRate 1 73637 73731
## - PoorPhysicalHealthDays 1 73638 73732
## - PrematureMortality 1 73640 73734
## - BMI 2 73643 73735
## - WaterSafety 1 73646 73740
## - ParkAccess 1 73647 73741
## - age 1 73654 73748
## - SomeCollegeRate 1 73659 73753
## - PhysicalActivity 1 73671 73765
## - HeavyDrinker 1 73675 73769
## - HealthInsurance 1 73700 73794
## - CurrentSmoker 1 73726 73820
## - income 4 73737 73825
## - race 1 73792 73886
## - PoorSleepDays 1 73940 74034
## - LimitedActivity 1 74022 74116
## - marital 1 74053 74147
## - GeneralHealth 4 74150 74238
## - employ 7 74352 74434
## - PoorMentalHealthDays 1 77897 77991
## - SocialSupport 4 81619 81707
##
## Step: AIC=73727.11
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth +
## PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma +
## HealthInsurance + CVD + LimitedActivity + employ + BMI +
## HeavyDrinker + CurrentSmoker + PoorSleepDays + PhysicalActivity +
## marital + PrematureMortality + HouseholdIncome + ParkAccess +
## CrimeRate + UnemploymentRate + WaterSafety + HighSchoolRate +
## SomeCollegeRate + AccessToRecFacility + FastFoodPercentage +
## PM2.5 + PopulationDensity
##
## Df Deviance AIC
## - HouseholdIncome 1 73633 73725
## - HighSchoolRate 1 73633 73725
## - PM2.5 1 73633 73725
## - CVD 1 73634 73726
## - FastFoodPercentage 1 73635 73727
## - AccessToRecFacility 1 73635 73727
## - Asthma 1 73635 73727
## <none> 73633 73727
## - UnemploymentRate 1 73636 73728
## - PopulationDensity 1 73636 73728
## - CrimeRate 1 73637 73729
## + Diabetes 1 73633 73729
## - PoorPhysicalHealthDays 1 73638 73730
## - PrematureMortality 1 73640 73732
## - BMI 2 73643 73733
## - WaterSafety 1 73646 73738
## - ParkAccess 1 73647 73739
## - age 1 73654 73746
## - SomeCollegeRate 1 73659 73751
## - PhysicalActivity 1 73671 73763
## - HeavyDrinker 1 73676 73768
## - HealthInsurance 1 73700 73792
## - CurrentSmoker 1 73726 73818
## - income 4 73737 73823
## - race 1 73793 73885
## - PoorSleepDays 1 73940 74032
## - LimitedActivity 1 74022 74114
## - marital 1 74053 74145
## - GeneralHealth 4 74157 74243
## - employ 7 74352 74432
## - PoorMentalHealthDays 1 77898 77990
## - SocialSupport 4 81619 81705
##
## Step: AIC=73725.13
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth +
## PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma +
## HealthInsurance + CVD + LimitedActivity + employ + BMI +
## HeavyDrinker + CurrentSmoker + PoorSleepDays + PhysicalActivity +
## marital + PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate +
## WaterSafety + HighSchoolRate + SomeCollegeRate + AccessToRecFacility +
## FastFoodPercentage + PM2.5 + PopulationDensity
##
## Df Deviance AIC
## - HighSchoolRate 1 73633 73723
## - PM2.5 1 73633 73723
## - CVD 1 73634 73724
## - FastFoodPercentage 1 73635 73725
## - AccessToRecFacility 1 73635 73725
## - Asthma 1 73635 73725
## <none> 73633 73725
## - UnemploymentRate 1 73636 73726
## - PopulationDensity 1 73636 73726
## - CrimeRate 1 73637 73727
## + HouseholdIncome 1 73633 73727
## + Diabetes 1 73633 73727
## - PoorPhysicalHealthDays 1 73638 73728
## - BMI 2 73643 73731
## - PrematureMortality 1 73641 73731
## - WaterSafety 1 73646 73736
## - ParkAccess 1 73647 73737
## - age 1 73654 73744
## - SomeCollegeRate 1 73662 73752
## - PhysicalActivity 1 73671 73761
## - HeavyDrinker 1 73676 73766
## - HealthInsurance 1 73700 73790
## - CurrentSmoker 1 73726 73816
## - income 4 73738 73822
## - race 1 73793 73883
## - PoorSleepDays 1 73940 74030
## - LimitedActivity 1 74022 74112
## - marital 1 74053 74143
## - GeneralHealth 4 74157 74241
## - employ 7 74352 74430
## - PoorMentalHealthDays 1 77899 77989
## - SocialSupport 4 81620 81704
##
## Step: AIC=73723.15
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth +
## PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma +
## HealthInsurance + CVD + LimitedActivity + employ + BMI +
## HeavyDrinker + CurrentSmoker + PoorSleepDays + PhysicalActivity +
## marital + PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate +
## WaterSafety + SomeCollegeRate + AccessToRecFacility + FastFoodPercentage +
## PM2.5 + PopulationDensity
##
## Df Deviance AIC
## - PM2.5 1 73633 73721
## - CVD 1 73635 73723
## - FastFoodPercentage 1 73635 73723
## - AccessToRecFacility 1 73635 73723
## - Asthma 1 73635 73723
## <none> 73633 73723
## - UnemploymentRate 1 73636 73724
## - PopulationDensity 1 73636 73724
## + HighSchoolRate 1 73633 73725
## + HouseholdIncome 1 73633 73725
## + Diabetes 1 73633 73725
## - CrimeRate 1 73638 73726
## - PoorPhysicalHealthDays 1 73638 73726
## - BMI 2 73643 73729
## - PrematureMortality 1 73641 73729
## - WaterSafety 1 73646 73734
## - ParkAccess 1 73648 73736
## - age 1 73654 73742
## - SomeCollegeRate 1 73662 73750
## - PhysicalActivity 1 73671 73759
## - HeavyDrinker 1 73676 73764
## - HealthInsurance 1 73700 73788
## - CurrentSmoker 1 73726 73814
## - income 4 73738 73820
## - race 1 73793 73881
## - PoorSleepDays 1 73941 74029
## - LimitedActivity 1 74022 74110
## - marital 1 74053 74141
## - GeneralHealth 4 74157 74239
## - employ 7 74352 74428
## - PoorMentalHealthDays 1 77899 77987
## - SocialSupport 4 81620 81702
##
## Step: AIC=73721.47
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth +
## PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma +
## HealthInsurance + CVD + LimitedActivity + employ + BMI +
## HeavyDrinker + CurrentSmoker + PoorSleepDays + PhysicalActivity +
## marital + PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate +
## WaterSafety + SomeCollegeRate + AccessToRecFacility + FastFoodPercentage +
## PopulationDensity
##
## Df Deviance AIC
## - FastFoodPercentage 1 73635 73721
## - CVD 1 73635 73721
## - AccessToRecFacility 1 73635 73721
## - Asthma 1 73635 73721
## <none> 73633 73721
## - UnemploymentRate 1 73636 73722
## - PopulationDensity 1 73636 73722
## + PM2.5 1 73633 73723
## + HighSchoolRate 1 73633 73723
## + HouseholdIncome 1 73633 73723
## + Diabetes 1 73633 73723
## - CrimeRate 1 73638 73724
## - PoorPhysicalHealthDays 1 73638 73724
## - BMI 2 73643 73727
## - PrematureMortality 1 73641 73727
## - WaterSafety 1 73646 73732
## - ParkAccess 1 73648 73734
## - age 1 73655 73741
## - SomeCollegeRate 1 73663 73749
## - PhysicalActivity 1 73672 73758
## - HeavyDrinker 1 73676 73762
## - HealthInsurance 1 73701 73787
## - CurrentSmoker 1 73726 73812
## - income 4 73738 73818
## - race 1 73795 73881
## - PoorSleepDays 1 73941 74027
## - LimitedActivity 1 74022 74108
## - marital 1 74054 74140
## - GeneralHealth 4 74158 74238
## - employ 7 74353 74427
## - PoorMentalHealthDays 1 77899 77985
## - SocialSupport 4 81620 81700
##
## Step: AIC=73720.8
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth +
## PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma +
## HealthInsurance + CVD + LimitedActivity + employ + BMI +
## HeavyDrinker + CurrentSmoker + PoorSleepDays + PhysicalActivity +
## marital + PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate +
## WaterSafety + SomeCollegeRate + AccessToRecFacility + PopulationDensity
##
## Df Deviance AIC
## - AccessToRecFacility 1 73636 73720
## - CVD 1 73636 73720
## - Asthma 1 73637 73721
## <none> 73635 73721
## + FastFoodPercentage 1 73633 73721
## - UnemploymentRate 1 73638 73722
## - PopulationDensity 1 73638 73722
## + PM2.5 1 73635 73723
## + HighSchoolRate 1 73635 73723
## + HouseholdIncome 1 73635 73723
## + Diabetes 1 73635 73723
## - CrimeRate 1 73639 73723
## - PoorPhysicalHealthDays 1 73640 73724
## - BMI 2 73645 73727
## - PrematureMortality 1 73645 73729
## - ParkAccess 1 73648 73732
## - WaterSafety 1 73648 73732
## - age 1 73656 73740
## - SomeCollegeRate 1 73664 73748
## - PhysicalActivity 1 73673 73757
## - HeavyDrinker 1 73677 73761
## - HealthInsurance 1 73702 73786
## - CurrentSmoker 1 73728 73812
## - income 4 73740 73818
## - race 1 73798 73882
## - PoorSleepDays 1 73942 74026
## - LimitedActivity 1 74024 74108
## - marital 1 74056 74140
## - GeneralHealth 4 74158 74236
## - employ 7 74355 74427
## - PoorMentalHealthDays 1 77900 77984
## - SocialSupport 4 81622 81700
##
## Step: AIC=73719.81
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth +
## PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma +
## HealthInsurance + CVD + LimitedActivity + employ + BMI +
## HeavyDrinker + CurrentSmoker + PoorSleepDays + PhysicalActivity +
## marital + PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate +
## WaterSafety + SomeCollegeRate + PopulationDensity
##
## Df Deviance AIC
## - CVD 1 73637 73719
## - Asthma 1 73638 73720
## <none> 73636 73720
## + AccessToRecFacility 1 73635 73721
## - PopulationDensity 1 73639 73721
## - UnemploymentRate 1 73639 73721
## + FastFoodPercentage 1 73635 73721
## + PM2.5 1 73636 73722
## + HighSchoolRate 1 73636 73722
## + HouseholdIncome 1 73636 73722
## + Diabetes 1 73636 73722
## - CrimeRate 1 73640 73722
## - PoorPhysicalHealthDays 1 73641 73723
## - BMI 2 73646 73726
## - PrematureMortality 1 73645 73727
## - WaterSafety 1 73649 73731
## - ParkAccess 1 73649 73731
## - age 1 73657 73739
## - SomeCollegeRate 1 73664 73746
## - PhysicalActivity 1 73674 73756
## - HeavyDrinker 1 73678 73760
## - HealthInsurance 1 73703 73785
## - CurrentSmoker 1 73729 73811
## - income 4 73741 73817
## - race 1 73798 73880
## - PoorSleepDays 1 73943 74025
## - LimitedActivity 1 74025 74107
## - marital 1 74056 74138
## - GeneralHealth 4 74160 74236
## - employ 7 74355 74425
## - PoorMentalHealthDays 1 77900 77982
## - SocialSupport 4 81623 81699
##
## Step: AIC=73719.15
## wellbeing ~ age + SocialSupport + race + income + GeneralHealth +
## PoorPhysicalHealthDays + PoorMentalHealthDays + Asthma +
## HealthInsurance + LimitedActivity + employ + BMI + HeavyDrinker +
## CurrentSmoker + PoorSleepDays + PhysicalActivity + marital +
## PrematureMortality + ParkAccess + CrimeRate + UnemploymentRate +
## WaterSafety + SomeCollegeRate + PopulationDensity
##
## Df Deviance AIC
## <none> 73637 73719
## - Asthma 1 73639 73719
## + CVD 1 73636 73720
## - PopulationDensity 1 73640 73720
## + AccessToRecFacility 1 73636 73720
## - UnemploymentRate 1 73640 73720
## + FastFoodPercentage 1 73636 73720
## + PM2.5 1 73637 73721
## + HighSchoolRate 1 73637 73721
## + HouseholdIncome 1 73637 73721
## + Diabetes 1 73637 73721
## - CrimeRate 1 73641 73721
## - PoorPhysicalHealthDays 1 73642 73722
## - BMI 2 73647 73725
## - PrematureMortality 1 73647 73727
## - WaterSafety 1 73650 73730
## - ParkAccess 1 73651 73731
## - age 1 73660 73740
## - SomeCollegeRate 1 73666 73746
## - PhysicalActivity 1 73675 73755
## - HeavyDrinker 1 73680 73760
## - HealthInsurance 1 73705 73785
## - CurrentSmoker 1 73730 73810
## - income 4 73743 73817
## - race 1 73799 73879
## - PoorSleepDays 1 73943 74023
## - LimitedActivity 1 74026 74106
## - marital 1 74058 74138
## - GeneralHealth 4 74163 74237
## - employ 7 74357 74425
## - PoorMentalHealthDays 1 77904 77984
## - SocialSupport 4 81625 81699
model.step <- glm(formula= df_final$wellbeing ~ . ,
df_final[,(colnames(step$model[-1]))], family=binomial(logit))
summary(model.step)
##
## Call:
## glm(formula = df_final$wellbeing ~ ., family = binomial(logit),
## data = df_final[, (colnames(step$model[-1]))])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5218 -0.2318 -0.1418 -0.1015 3.6337
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.136e+00 1.700e-01 -12.567 < 2e-16 ***
## age -4.343e-03 9.086e-04 -4.779 1.76e-06 ***
## SocialSupport Sometimes -8.361e-01 3.216e-02 -25.998 < 2e-16 ***
## SocialSupportAlways -2.672e+00 3.620e-02 -73.814 < 2e-16 ***
## SocialSupportNever -7.592e-01 4.079e-02 -18.610 < 2e-16 ***
## SocialSupportUsually -2.023e+00 3.454e-02 -58.556 < 2e-16 ***
## raceWhite 3.358e-01 2.672e-02 12.566 < 2e-16 ***
## income[25000-35000) -5.297e-02 3.508e-02 -1.510 0.130996
## income[35000-50000) -1.068e-01 3.577e-02 -2.987 0.002815 **
## income50000 or more -3.186e-01 3.396e-02 -9.380 < 2e-16 ***
## incomeless than 15000 1.940e-02 2.921e-02 0.664 0.506674
## GeneralHealthFair 6.493e-01 4.789e-02 13.559 < 2e-16 ***
## GeneralHealthGood 3.510e-01 4.396e-02 7.985 1.40e-15 ***
## GeneralHealthPoor 1.088e+00 5.471e-02 19.892 < 2e-16 ***
## GeneralHealthVery good 1.929e-01 4.481e-02 4.305 1.67e-05 ***
## PoorPhysicalHealthDays 2.666e-03 1.194e-03 2.232 0.025636 *
## PoorMentalHealthDays 6.145e-02 9.244e-04 66.482 < 2e-16 ***
## AsthmaNo Asthma 4.194e-02 2.910e-02 1.441 0.149508
## HealthInsuranceWith Health Coverage -2.390e-01 2.891e-02 -8.267 < 2e-16 ***
## LimitedActivityNot Limited -4.958e-01 2.500e-02 -19.835 < 2e-16 ***
## employRetired -1.492e-02 5.537e-02 -0.269 0.787574
## employSelf-Employed 2.677e-01 6.254e-02 4.280 1.87e-05 ***
## employStudent 2.304e-01 9.313e-02 2.474 0.013351 *
## employUnable to work 4.554e-01 5.468e-02 8.328 < 2e-16 ***
## employUnemployed for less than 1 yr 9.542e-01 6.288e-02 15.176 < 2e-16 ***
## employUnemployed for more than 1 yr 9.460e-01 5.960e-02 15.873 < 2e-16 ***
## employWaged Employement 1.731e-01 5.172e-02 3.346 0.000820 ***
## BMIObese -1.599e-02 2.571e-02 -0.622 0.534076
## BMIOverweight -7.575e-02 2.566e-02 -2.952 0.003156 **
## HeavyDrinkerNot a Heavy Drinker -2.813e-01 4.221e-02 -6.663 2.68e-11 ***
## CurrentSmokerNot a Current Smoker -2.299e-01 2.367e-02 -9.710 < 2e-16 ***
## PoorSleepDays 1.682e-02 9.530e-04 17.650 < 2e-16 ***
## PhysicalActivityPhysically Active -1.360e-01 2.203e-02 -6.173 6.70e-10 ***
## maritalNot married 4.790e-01 2.350e-02 20.383 < 2e-16 ***
## PrematureMortality -5.347e-04 1.725e-04 -3.100 0.001934 **
## ParkAccess 2.041e-03 5.496e-04 3.714 0.000204 ***
## CrimeRate 9.690e-05 4.793e-05 2.022 0.043191 *
## UnemploymentRate 8.125e-03 4.568e-03 1.778 0.075334 .
## WaterSafety -2.551e-03 7.127e-04 -3.580 0.000344 ***
## SomeCollegeRate 7.366e-03 1.383e-03 5.326 1.00e-07 ***
## PopulationDensity 4.423e-06 2.497e-06 1.771 0.076483 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 114891 on 275602 degrees of freedom
## Residual deviance: 73637 on 275562 degrees of freedom
## AIC: 73719
##
## Number of Fisher Scoring iterations: 7
# setup variable
pred.m1 <- c("age","SocialSupport","race","income","GeneralHealth","PoorPhysicalHealthDays","PoorMentalHealthDays","HealthInsurance","LimitedActivity","employ","BMI","HeavyDrinker","CurrentSmoker","PoorSleepDays","PhysicalActivity","marital","PrematureMortality","ParkAccess","CrimeRate","WaterSafety","SomeCollegeRate")
# model with reduced predictors
model.1 <- glm(wellbeing ~
age+
SocialSupport+
race+
income+
GeneralHealth+
PoorPhysicalHealthDays+
PoorMentalHealthDays+
HealthInsurance+
LimitedActivity+
employ+
BMI+
HeavyDrinker+
CurrentSmoker+
PoorSleepDays+
PhysicalActivity+
marital+
PrematureMortality+
ParkAccess+
CrimeRate+
WaterSafety+
SomeCollegeRate,
family=binomial,
data=df_final)
summary(model.1)
##
## Call:
## glm(formula = wellbeing ~ age + SocialSupport + race + income +
## GeneralHealth + PoorPhysicalHealthDays + PoorMentalHealthDays +
## HealthInsurance + LimitedActivity + employ + BMI + HeavyDrinker +
## CurrentSmoker + PoorSleepDays + PhysicalActivity + marital +
## PrematureMortality + ParkAccess + CrimeRate + WaterSafety +
## SomeCollegeRate, family = binomial, data = df_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5358 -0.2317 -0.1418 -0.1015 3.6273
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.9979684 0.1526996 -13.084 < 2e-16 ***
## age -0.0042646 0.0009079 -4.697 2.64e-06 ***
## SocialSupport Sometimes -0.8359781 0.0321565 -25.997 < 2e-16 ***
## SocialSupportAlways -2.6729660 0.0361949 -73.849 < 2e-16 ***
## SocialSupportNever -0.7586071 0.0407889 -18.598 < 2e-16 ***
## SocialSupportUsually -2.0231561 0.0345371 -58.579 < 2e-16 ***
## raceWhite 0.3310936 0.0266468 12.425 < 2e-16 ***
## income[25000-35000) -0.0528918 0.0350735 -1.508 0.131547
## income[35000-50000) -0.1060674 0.0357617 -2.966 0.003018 **
## income50000 or more -0.3164002 0.0339561 -9.318 < 2e-16 ***
## incomeless than 15000 0.0185804 0.0292012 0.636 0.524588
## GeneralHealthFair 0.6465787 0.0478520 13.512 < 2e-16 ***
## GeneralHealthGood 0.3494260 0.0439486 7.951 1.85e-15 ***
## GeneralHealthPoor 1.0840043 0.0546523 19.835 < 2e-16 ***
## GeneralHealthVery good 0.1922407 0.0448107 4.290 1.79e-05 ***
## PoorPhysicalHealthDays 0.0026565 0.0011945 2.224 0.026155 *
## PoorMentalHealthDays 0.0614357 0.0009242 66.477 < 2e-16 ***
## HealthInsuranceWith Health Coverage -0.2399066 0.0288953 -8.303 < 2e-16 ***
## LimitedActivityNot Limited -0.4931359 0.0249503 -19.765 < 2e-16 ***
## employRetired -0.0141648 0.0553658 -0.256 0.798075
## employSelf-Employed 0.2688574 0.0625238 4.300 1.71e-05 ***
## employStudent 0.2311005 0.0931280 2.482 0.013082 *
## employUnable to work 0.4548113 0.0546684 8.319 < 2e-16 ***
## employUnemployed for less than 1 yr 0.9561157 0.0628808 15.205 < 2e-16 ***
## employUnemployed for more than 1 yr 0.9496466 0.0595914 15.936 < 2e-16 ***
## employWaged Employement 0.1723412 0.0517107 3.333 0.000860 ***
## BMIObese -0.0198489 0.0256516 -0.774 0.439056
## BMIOverweight -0.0769001 0.0256529 -2.998 0.002720 **
## HeavyDrinkerNot a Heavy Drinker -0.2831470 0.0422020 -6.709 1.96e-11 ***
## CurrentSmokerNot a Current Smoker -0.2287189 0.0236696 -9.663 < 2e-16 ***
## PoorSleepDays 0.0167672 0.0009523 17.608 < 2e-16 ***
## PhysicalActivityPhysically Active -0.1353385 0.0220301 -6.143 8.08e-10 ***
## maritalNot married 0.4798601 0.0234950 20.424 < 2e-16 ***
## PrematureMortality -0.0005309 0.0001725 -3.078 0.002086 **
## ParkAccess 0.0022832 0.0005331 4.283 1.85e-05 ***
## CrimeRate 0.0001229 0.0000468 2.626 0.008652 **
## WaterSafety -0.0026511 0.0007098 -3.735 0.000188 ***
## SomeCollegeRate 0.0066705 0.0012933 5.158 2.50e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 114891 on 275602 degrees of freedom
## Residual deviance: 73645 on 275565 degrees of freedom
## AIC: 73721
##
## Number of Fisher Scoring iterations: 7
# library(corrplot)
# library(sjPlot)
rr_county.m1 <- df_final[,c(2,22, 24:27,29)]
rr_cm.m1 <- as.matrix(rr_county.m1)
corr_m1 <- cor(rr_cm.m1, use="pairwise.complete.obs")
# correlogram
testRes.m1 <- cor.mtest(rr_cm.m1, conf.level = 0.95)
corrplot.mixed(corr_m1,
upper = "ellipse",
p.mat = testRes.m1$p,
title="Correlation Matrix of Variables of Interest")
vif_1 <- vif(model.1)
vif_1 <- as.data.frame(vif_1)
#create horizontal bar chart to display each VIF value
vif_plot_1 <- vif_1 %>%
ggplot(aes(x=rownames(vif_1), y=GVIF)) +
geom_col(color="#434343", fill="#FFB87B") +
xlab("Statistically Significant Predictors") +
ylab("VIF Values") +
ggtitle("VIF Values for Reduced Model") +
theme_bw() +
geom_hline(aes(yintercept=5),color="#434343", linetype="dashed", linewidth=0.5) +
coord_flip()
vif_plot_1
# forest plot
library(finalfit)
# df_final %>%
# or_plot(dependent, pred.all)
# library(broom)
# model.1 %>%
# tidy(conf.int=TRUE, exp=TRUE) %>%
# datatable(rownames=FALSE,
# colnames=c("predictor", "odds ratio", "std error", "statistic", "p-value", "CI low", "CI high"),
# options=list(pageLength=15, scrollX=T))
# forest plot
# library(finalfit)
df_final %>%
or_plot(dependent, pred.m1)
## Waiting for profiling to be done...
## Waiting for profiling to be done...
## Waiting for profiling to be done...
## Warning: Removed 12 rows containing missing values (`geom_errorbarh()`).
Model_Int2 <- glm(wellbeing ~
age +
race:income +
SocialSupport +
GeneralHealth +
PoorPhysicalHealthDays +
PoorMentalHealthDays +
HealthInsurance +
LimitedActivity +
employ +
BMI +
HeavyDrinker +
CurrentSmoker +
PoorSleepDays +
PhysicalActivity +
marital +
PrematureMortality +
ParkAccess +
CrimeRate +
WaterSafety +
SomeCollegeRate,
family = "binomial", df_final)
summary(Model_Int2)
##
## Call:
## glm(formula = wellbeing ~ age + race:income + SocialSupport +
## GeneralHealth + PoorPhysicalHealthDays + PoorMentalHealthDays +
## HealthInsurance + LimitedActivity + employ + BMI + HeavyDrinker +
## CurrentSmoker + PoorSleepDays + PhysicalActivity + marital +
## PrematureMortality + ParkAccess + CrimeRate + WaterSafety +
## SomeCollegeRate, family = "binomial", data = df_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5374 -0.2319 -0.1414 -0.1015 3.5655
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.600e+00 1.526e-01 -10.488 < 2e-16
## age -4.271e-03 9.084e-04 -4.702 2.57e-06
## SocialSupport Sometimes -8.353e-01 3.216e-02 -25.973 < 2e-16
## SocialSupportAlways -2.673e+00 3.620e-02 -73.850 < 2e-16
## SocialSupportNever -7.567e-01 4.078e-02 -18.557 < 2e-16
## SocialSupportUsually -2.022e+00 3.454e-02 -58.533 < 2e-16
## GeneralHealthFair 6.453e-01 4.784e-02 13.488 < 2e-16
## GeneralHealthGood 3.459e-01 4.395e-02 7.871 3.53e-15
## GeneralHealthPoor 1.082e+00 5.466e-02 19.803 < 2e-16
## GeneralHealthVery good 1.911e-01 4.481e-02 4.265 2.00e-05
## PoorPhysicalHealthDays 2.579e-03 1.195e-03 2.158 0.03096
## PoorMentalHealthDays 6.143e-02 9.244e-04 66.447 < 2e-16
## HealthInsuranceWith Health Coverage -2.405e-01 2.890e-02 -8.323 < 2e-16
## LimitedActivityNot Limited -4.920e-01 2.497e-02 -19.704 < 2e-16
## employRetired -2.063e-02 5.537e-02 -0.373 0.70939
## employSelf-Employed 2.652e-01 6.254e-02 4.240 2.24e-05
## employStudent 2.231e-01 9.309e-02 2.397 0.01655
## employUnable to work 4.505e-01 5.468e-02 8.239 < 2e-16
## employUnemployed for less than 1 yr 9.508e-01 6.289e-02 15.119 < 2e-16
## employUnemployed for more than 1 yr 9.452e-01 5.960e-02 15.859 < 2e-16
## employWaged Employement 1.664e-01 5.173e-02 3.216 0.00130
## BMIObese -1.797e-02 2.566e-02 -0.700 0.48384
## BMIOverweight -7.519e-02 2.566e-02 -2.930 0.00339
## HeavyDrinkerNot a Heavy Drinker -2.879e-01 4.224e-02 -6.816 9.34e-12
## CurrentSmokerNot a Current Smoker -2.264e-01 2.369e-02 -9.557 < 2e-16
## PoorSleepDays 1.673e-02 9.524e-04 17.567 < 2e-16
## PhysicalActivityPhysically Active -1.343e-01 2.203e-02 -6.095 1.09e-09
## maritalNot married 4.741e-01 2.351e-02 20.163 < 2e-16
## PrematureMortality -5.271e-04 1.726e-04 -3.053 0.00226
## ParkAccess 2.267e-03 5.332e-04 4.252 2.12e-05
## CrimeRate 1.275e-04 4.681e-05 2.724 0.00646
## WaterSafety -2.661e-03 7.106e-04 -3.745 0.00018
## SomeCollegeRate 6.769e-03 1.294e-03 5.231 1.68e-07
## raceOther races:income[15000-25000) -4.195e-01 4.816e-02 -8.710 < 2e-16
## raceWhite:income[15000-25000) -5.328e-02 3.418e-02 -1.559 0.11896
## raceOther races:income[25000-35000) -4.965e-01 6.992e-02 -7.101 1.24e-12
## raceWhite:income[25000-35000) -1.031e-01 4.202e-02 -2.455 0.01411
## raceOther races:income[35000-50000) -3.507e-01 7.278e-02 -4.818 1.45e-06
## raceWhite:income[35000-50000) -2.006e-01 4.274e-02 -4.694 2.68e-06
## raceOther races:income50000 or more -4.924e-01 6.511e-02 -7.562 3.97e-14
## raceWhite:income50000 or more -4.157e-01 4.090e-02 -10.164 < 2e-16
## raceOther races:incomeless than 15000 -4.644e-01 4.475e-02 -10.376 < 2e-16
## raceWhite:incomeless than 15000 NA NA NA NA
##
## (Intercept) ***
## age ***
## SocialSupport Sometimes ***
## SocialSupportAlways ***
## SocialSupportNever ***
## SocialSupportUsually ***
## GeneralHealthFair ***
## GeneralHealthGood ***
## GeneralHealthPoor ***
## GeneralHealthVery good ***
## PoorPhysicalHealthDays *
## PoorMentalHealthDays ***
## HealthInsuranceWith Health Coverage ***
## LimitedActivityNot Limited ***
## employRetired
## employSelf-Employed ***
## employStudent *
## employUnable to work ***
## employUnemployed for less than 1 yr ***
## employUnemployed for more than 1 yr ***
## employWaged Employement **
## BMIObese
## BMIOverweight **
## HeavyDrinkerNot a Heavy Drinker ***
## CurrentSmokerNot a Current Smoker ***
## PoorSleepDays ***
## PhysicalActivityPhysically Active ***
## maritalNot married ***
## PrematureMortality **
## ParkAccess ***
## CrimeRate **
## WaterSafety ***
## SomeCollegeRate ***
## raceOther races:income[15000-25000) ***
## raceWhite:income[15000-25000)
## raceOther races:income[25000-35000) ***
## raceWhite:income[25000-35000) *
## raceOther races:income[35000-50000) ***
## raceWhite:income[35000-50000) ***
## raceOther races:income50000 or more ***
## raceWhite:income50000 or more ***
## raceOther races:incomeless than 15000 ***
## raceWhite:incomeless than 15000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 114891 on 275602 degrees of freedom
## Residual deviance: 73612 on 275561 degrees of freedom
## AIC: 73696
##
## Number of Fisher Scoring iterations: 7
Model_Int3 <- glm(wellbeing ~
age +
race:GeneralHealth +
SocialSupport +
income +
PoorPhysicalHealthDays +
PoorMentalHealthDays +
HealthInsurance +
LimitedActivity +
employ +
BMI +
HeavyDrinker +
CurrentSmoker +
PoorSleepDays +
PhysicalActivity +
marital +
PrematureMortality +
ParkAccess +
CrimeRate +
WaterSafety +
SomeCollegeRate,
family = "binomial", df_final)
summary(Model_Int3)
##
## Call:
## glm(formula = wellbeing ~ age + race:GeneralHealth + SocialSupport +
## income + PoorPhysicalHealthDays + PoorMentalHealthDays +
## HealthInsurance + LimitedActivity + employ + BMI + HeavyDrinker +
## CurrentSmoker + PoorSleepDays + PhysicalActivity + marital +
## PrematureMortality + ParkAccess + CrimeRate + WaterSafety +
## SomeCollegeRate, family = "binomial", data = df_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5449 -0.2320 -0.1414 -0.1011 3.5341
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.521e+00 1.491e-01 -10.205 < 2e-16
## age -3.961e-03 9.095e-04 -4.355 1.33e-05
## SocialSupport Sometimes -8.344e-01 3.215e-02 -25.950 < 2e-16
## SocialSupportAlways -2.670e+00 3.619e-02 -73.765 < 2e-16
## SocialSupportNever -7.584e-01 4.076e-02 -18.605 < 2e-16
## SocialSupportUsually -2.019e+00 3.455e-02 -58.440 < 2e-16
## income[25000-35000) -5.381e-02 3.507e-02 -1.534 0.124943
## income[35000-50000) -1.049e-01 3.576e-02 -2.934 0.003346
## income50000 or more -3.104e-01 3.394e-02 -9.146 < 2e-16
## incomeless than 15000 2.220e-02 2.920e-02 0.760 0.447153
## PoorPhysicalHealthDays 2.368e-03 1.196e-03 1.980 0.047759
## PoorMentalHealthDays 6.148e-02 9.243e-04 66.509 < 2e-16
## HealthInsuranceWith Health Coverage -2.408e-01 2.888e-02 -8.338 < 2e-16
## LimitedActivityNot Limited -4.879e-01 2.499e-02 -19.523 < 2e-16
## employRetired -2.205e-02 5.537e-02 -0.398 0.690429
## employSelf-Employed 2.699e-01 6.254e-02 4.316 1.59e-05
## employStudent 2.193e-01 9.309e-02 2.356 0.018479
## employUnable to work 4.548e-01 5.466e-02 8.321 < 2e-16
## employUnemployed for less than 1 yr 9.496e-01 6.288e-02 15.101 < 2e-16
## employUnemployed for more than 1 yr 9.460e-01 5.957e-02 15.879 < 2e-16
## employWaged Employement 1.706e-01 5.172e-02 3.297 0.000976
## BMIObese -2.046e-02 2.565e-02 -0.798 0.425033
## BMIOverweight -7.862e-02 2.566e-02 -3.064 0.002187
## HeavyDrinkerNot a Heavy Drinker -2.875e-01 4.224e-02 -6.806 1.00e-11
## CurrentSmokerNot a Current Smoker -2.273e-01 2.367e-02 -9.602 < 2e-16
## PoorSleepDays 1.685e-02 9.520e-04 17.703 < 2e-16
## PhysicalActivityPhysically Active -1.314e-01 2.203e-02 -5.964 2.46e-09
## maritalNot married 4.783e-01 2.350e-02 20.353 < 2e-16
## PrematureMortality -5.454e-04 1.725e-04 -3.162 0.001567
## ParkAccess 2.274e-03 5.331e-04 4.265 2.00e-05
## CrimeRate 1.240e-04 4.678e-05 2.650 0.008041
## WaterSafety -2.660e-03 7.104e-04 -3.744 0.000181
## SomeCollegeRate 6.708e-03 1.293e-03 5.187 2.14e-07
## raceOther races:GeneralHealthExcellent -1.681e-01 8.260e-02 -2.035 0.041857
## raceWhite:GeneralHealthExcellent -2.538e-01 5.104e-02 -4.972 6.64e-07
## raceOther races:GeneralHealthFair 7.128e-02 5.067e-02 1.407 0.159503
## raceWhite:GeneralHealthFair 5.221e-01 3.860e-02 13.526 < 2e-16
## raceOther races:GeneralHealthGood -1.332e-01 4.895e-02 -2.721 0.006509
## raceWhite:GeneralHealthGood 1.875e-01 3.348e-02 5.600 2.15e-08
## raceOther races:GeneralHealthPoor 5.089e-01 6.153e-02 8.270 < 2e-16
## raceWhite:GeneralHealthPoor 9.607e-01 4.718e-02 20.361 < 2e-16
## raceOther races:GeneralHealthVery good -1.493e-01 6.158e-02 -2.424 0.015345
## raceWhite:GeneralHealthVery good NA NA NA NA
##
## (Intercept) ***
## age ***
## SocialSupport Sometimes ***
## SocialSupportAlways ***
## SocialSupportNever ***
## SocialSupportUsually ***
## income[25000-35000)
## income[35000-50000) **
## income50000 or more ***
## incomeless than 15000
## PoorPhysicalHealthDays *
## PoorMentalHealthDays ***
## HealthInsuranceWith Health Coverage ***
## LimitedActivityNot Limited ***
## employRetired
## employSelf-Employed ***
## employStudent *
## employUnable to work ***
## employUnemployed for less than 1 yr ***
## employUnemployed for more than 1 yr ***
## employWaged Employement ***
## BMIObese
## BMIOverweight **
## HeavyDrinkerNot a Heavy Drinker ***
## CurrentSmokerNot a Current Smoker ***
## PoorSleepDays ***
## PhysicalActivityPhysically Active ***
## maritalNot married ***
## PrematureMortality **
## ParkAccess ***
## CrimeRate **
## WaterSafety ***
## SomeCollegeRate ***
## raceOther races:GeneralHealthExcellent *
## raceWhite:GeneralHealthExcellent ***
## raceOther races:GeneralHealthFair
## raceWhite:GeneralHealthFair ***
## raceOther races:GeneralHealthGood **
## raceWhite:GeneralHealthGood ***
## raceOther races:GeneralHealthPoor ***
## raceWhite:GeneralHealthPoor ***
## raceOther races:GeneralHealthVery good *
## raceWhite:GeneralHealthVery good
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 114891 on 275602 degrees of freedom
## Residual deviance: 73605 on 275561 degrees of freedom
## AIC: 73689
##
## Number of Fisher Scoring iterations: 7
library(broom)
M1 <- glance(model.all)
M2 <- glance(model.1)
M3 <- glance(Model_Int1)
M4 <- glance(Model_Int2)
M5 <- glance(Model_Int3)
model_table <- bind_rows(M1, M2, M3, M4, M5)
datatable(model_table,
rownames=c("All", "Reduced", "race:SocialSupport", "race:income", "race:GeneralHealth"),
options=list(pageLength=5,scrollX=T)) %>%
formatRound(c(1:4:6),digits=2)
## Warning in 1:4:6: numerical expression has 4 elements: only the first used
## set the seed to make your partition reproducible
set.seed(123)
train_index <- sample(seq_len(nrow(df_final)), size = 0.75*nrow(df_final))
train <- df_final[train_index, ]
test <- df_final[-train_index, ]
nrow(train)
## [1] 206702
nrow(test)
## [1] 68901
pacman::p_load(caret)
glm_model <- train(wellbeing ~
age +
SocialSupport +
race +
income +
GeneralHealth +
PoorPhysicalHealthDays +
PoorMentalHealthDays +
HealthInsurance +
LimitedActivity +
employ +
BMI +
HeavyDrinker +
CurrentSmoker +
PoorSleepDays +
PhysicalActivity +
marital +
PrematureMortality +
ParkAccess +
CrimeRate +
WaterSafety +
SomeCollegeRate,
data = train, method = "glm", family = "binomial")
attributes(glm_model)
## $names
## [1] "method" "modelInfo" "modelType" "results" "pred"
## [6] "bestTune" "call" "dots" "metric" "control"
## [11] "finalModel" "preProcess" "trainingData" "ptype" "resample"
## [16] "resampledCM" "perfNames" "maximize" "yLimits" "times"
## [21] "levels" "terms" "coefnames" "contrasts" "xlevels"
##
## $class
## [1] "train" "train.formula"
summary(glm_model)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5314 -0.2309 -0.1414 -0.1017 3.5426
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.864e+00 1.769e-01 -10.537 < 2e-16
## age -4.681e-03 1.049e-03 -4.464 8.05e-06
## `SocialSupport Sometimes` -8.223e-01 3.712e-02 -22.149 < 2e-16
## SocialSupportAlways -2.687e+00 4.184e-02 -64.203 < 2e-16
## SocialSupportNever -7.524e-01 4.716e-02 -15.953 < 2e-16
## SocialSupportUsually -2.053e+00 4.002e-02 -51.307 < 2e-16
## raceWhite 3.656e-01 3.086e-02 11.846 < 2e-16
## `income[25000-35000)` -4.069e-02 4.051e-02 -1.004 0.315258
## `income[35000-50000)` -1.224e-01 4.163e-02 -2.941 0.003276
## `income50000 or more` -2.876e-01 3.914e-02 -7.347 2.03e-13
## `incomeless than 15000` 4.284e-02 3.375e-02 1.269 0.204380
## GeneralHealthFair 6.713e-01 5.504e-02 12.197 < 2e-16
## GeneralHealthGood 3.373e-01 5.069e-02 6.654 2.85e-11
## GeneralHealthPoor 1.093e+00 6.294e-02 17.361 < 2e-16
## `GeneralHealthVery good` 2.197e-01 5.156e-02 4.261 2.03e-05
## PoorPhysicalHealthDays 3.147e-03 1.380e-03 2.281 0.022572
## PoorMentalHealthDays 6.134e-02 1.069e-03 57.399 < 2e-16
## `HealthInsuranceWith Health Coverage` -2.508e-01 3.343e-02 -7.502 6.29e-14
## `LimitedActivityNot Limited` -4.933e-01 2.885e-02 -17.101 < 2e-16
## employRetired -1.542e-02 6.419e-02 -0.240 0.810110
## `employSelf-Employed` 2.687e-01 7.246e-02 3.709 0.000208
## employStudent 2.325e-01 1.082e-01 2.149 0.031619
## `employUnable to work` 4.698e-01 6.344e-02 7.405 1.31e-13
## `employUnemployed for less than 1 yr` 9.377e-01 7.304e-02 12.838 < 2e-16
## `employUnemployed for more than 1 yr` 9.333e-01 6.930e-02 13.467 < 2e-16
## `employWaged Employement` 1.661e-01 6.006e-02 2.766 0.005683
## BMIObese -6.533e-03 2.960e-02 -0.221 0.825346
## BMIOverweight -7.187e-02 2.963e-02 -2.426 0.015287
## `HeavyDrinkerNot a Heavy Drinker` -2.918e-01 4.869e-02 -5.994 2.04e-09
## `CurrentSmokerNot a Current Smoker` -2.524e-01 2.729e-02 -9.246 < 2e-16
## PoorSleepDays 1.612e-02 1.103e-03 14.614 < 2e-16
## `PhysicalActivityPhysically Active` -1.286e-01 2.546e-02 -5.051 4.40e-07
## `maritalNot married` 4.750e-01 2.712e-02 17.513 < 2e-16
## PrematureMortality -7.030e-04 2.002e-04 -3.511 0.000446
## ParkAccess 2.382e-03 6.170e-04 3.861 0.000113
## CrimeRate 1.438e-04 5.419e-05 2.655 0.007941
## WaterSafety -3.075e-03 8.254e-04 -3.726 0.000195
## SomeCollegeRate 5.702e-03 1.497e-03 3.809 0.000140
##
## (Intercept) ***
## age ***
## `SocialSupport Sometimes` ***
## SocialSupportAlways ***
## SocialSupportNever ***
## SocialSupportUsually ***
## raceWhite ***
## `income[25000-35000)`
## `income[35000-50000)` **
## `income50000 or more` ***
## `incomeless than 15000`
## GeneralHealthFair ***
## GeneralHealthGood ***
## GeneralHealthPoor ***
## `GeneralHealthVery good` ***
## PoorPhysicalHealthDays *
## PoorMentalHealthDays ***
## `HealthInsuranceWith Health Coverage` ***
## `LimitedActivityNot Limited` ***
## employRetired
## `employSelf-Employed` ***
## employStudent *
## `employUnable to work` ***
## `employUnemployed for less than 1 yr` ***
## `employUnemployed for more than 1 yr` ***
## `employWaged Employement` **
## BMIObese
## BMIOverweight *
## `HeavyDrinkerNot a Heavy Drinker` ***
## `CurrentSmokerNot a Current Smoker` ***
## PoorSleepDays ***
## `PhysicalActivityPhysically Active` ***
## `maritalNot married` ***
## PrematureMortality ***
## ParkAccess ***
## CrimeRate **
## WaterSafety ***
## SomeCollegeRate ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 86611 on 206701 degrees of freedom
## Residual deviance: 55231 on 206664 degrees of freedom
## AIC: 55307
##
## Number of Fisher Scoring iterations: 7
glm_model$metric
## [1] "Accuracy"
glm_model$results
## parameter Accuracy Kappa AccuracySD KappaSD
## 1 none 0.9517427 0.3641728 0.0006149987 0.007055452
test$pred <- glm_model %>% predict(test)
accuracy_function = function(real, predicted) {
mean(real == predicted)
}
accuracy_function(real = test$wellbeing,
predicted = test$pred)
## [1] 0.9527293
ggplot(varImp(glm_model))
ggplot(varImp(glm_model)) +
geom_bar(stat="identity", fill="#67d1d3", colour="black") +
ggtitle("Impacts of Features on Wellbeing") +
labs(y= "Relative Impacts of Features", x= "Features") +
theme_bw() +
theme(legend.position="none",
axis.text.y = element_text( size=12, face="bold")) +
expand_limits(y=0.85)
## Performance Metrics for Log Reg ML Model
confusionMtx_logreg = table(test$pred,test$wellbeing)
confusionMtx_logreg
##
## Good_Wellbeing Poor_Wellbeing
## Good_Wellbeing 64619 2578
## Poor_Wellbeing 679 1025
TP_logreg=confusionMtx_logreg[1,1]
FP_logreg=confusionMtx_logreg[1,2]
TN_logreg=confusionMtx_logreg[2,2]
FN_logreg=confusionMtx_logreg[2,1]
TP_logreg
## [1] 64619
recall_logreg=TP_logreg/(TP_logreg+FN_logreg)
recall_logreg
## [1] 0.9896015
Accuracy_logreg=(TP_logreg+TN_logreg)/nrow(test)
Accuracy_logreg
## [1] 0.9527293
specificity_logreg=TN_logreg/(TN_logreg+FP_logreg)
specificity_logreg
## [1] 0.2844852
precision_logreg=TP_logreg/(TP_logreg+FP_logreg)
precision_logreg
## [1] 0.9616352
F1_logreg=(2*recall_logreg*precision_logreg)/(recall_logreg+precision_logreg)
F1_logreg
## [1] 0.9754179
# retrieve probabilities
prob_logreg<- glm_model %>% predict(test,type='prob')
str(prob_logreg)
## 'data.frame': 68901 obs. of 2 variables:
## $ Good_Wellbeing: num 0.976 0.97 0.997 0.994 0.984 ...
## $ Poor_Wellbeing: num 0.02428 0.03014 0.00299 0.00625 0.01555 ...
logreg.roc=roc(predictor=prob_logreg$Good_Wellbeing,response=test$wellbeing,
levels=levels(test$wellbeing))
## Setting direction: controls > cases
logreg.roc
##
## Call:
## roc.default(response = test$wellbeing, predictor = prob_logreg$Good_Wellbeing, levels = levels(test$wellbeing))
##
## Data: prob_logreg$Good_Wellbeing in 65298 controls (test$wellbeing Good_Wellbeing) > 3603 cases (test$wellbeing Poor_Wellbeing).
## Area under the curve: 0.9066
ggroc(logreg.roc, alpha = 1, colour = "red", linetype = 1, size =0.7 ) +
geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="black", linetype="dashed") +
ggtitle("ROC Curve for Logistic Regression ML Model") +
xlab("1 - specificity") +
ylab("sensitivity") +
annotate(geom="text", x=0.7, y=0.75, label="Log Reg Model classifier", color="red") +
annotate(geom="text", x=0.45, y=0.4, label="random classifier") +
annotate(geom="text", x=0.1, y=0.05, label="AUC = 0.9066", color="darkred") +
theme_bw()
library(rattle)
library(pROC)
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:rattle':
##
## xgboost
## The following object is masked from 'package:dplyr':
##
## slice
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:psych':
##
## outlier
# using same partitioned dataset as before
# set.seed(123)
# train_index <- sample(seq_len(nrow(df_final)), size = 0.75*nrow(df_final))
#
# train <- df_final[train_index, ]
# test <- df_final[-train_index, ]
# cat(nrow(test),nrow(train))
levels(train$wellbeing) <- c("Good_Wellbeing", "Poor_Wellbeing")
trcntrl_ho<-trainControl(classProbs = TRUE,summaryFunction = twoClassSummary)
model_CART <- train(wellbeing ~
age +
SocialSupport +
race +
income +
GeneralHealth +
PoorPhysicalHealthDays +
PoorMentalHealthDays +
HealthInsurance +
LimitedActivity +
employ +
BMI +
HeavyDrinker +
CurrentSmoker +
PoorSleepDays +
PhysicalActivity +
marital +
PrematureMortality +
ParkAccess +
CrimeRate +
WaterSafety +
SomeCollegeRate ,
data = train, method = "rpart",trControl=trcntrl_ho,metric="ROC",tuneLength = 10)
model_CART$results
## cp ROC Sens Spec ROCSD SensSD
## 1 0.0006746424 0.7446683 0.9915859 0.2329208 0.016625048 0.0007384287
## 2 0.0007196186 0.7448023 0.9917384 0.2318861 0.016638739 0.0007102355
## 3 0.0007645948 0.7415792 0.9917300 0.2328093 0.004182825 0.0007047197
## 4 0.0009594915 0.7417753 0.9919693 0.2319710 0.004188718 0.0005169431
## 5 0.0009894756 0.7417870 0.9919665 0.2323544 0.004186428 0.0005654889
## 6 0.0013492849 0.7418750 0.9919461 0.2347920 0.004203448 0.0007907361
## 7 0.0029234506 0.7418689 0.9920702 0.2304524 0.004170225 0.0010178245
## 8 0.0031483314 0.7418494 0.9921898 0.2281074 0.004159639 0.0010638153
## 9 0.0140325627 0.7315662 0.9930982 0.1917173 0.048424555 0.0023999118
## 10 0.0174057749 0.6244066 0.9956310 0.1109419 0.122020455 0.0043656988
## SpecSD
## 1 0.01152684
## 2 0.01079265
## 3 0.01170444
## 4 0.01119856
## 5 0.01160028
## 6 0.01430872
## 7 0.01787513
## 8 0.01888900
## 9 0.04803669
## 10 0.10942878
plot(model_CART)
ggplot(varImp(model_CART))
ggplot(varImp(model_CART)) +
geom_bar(stat="identity", fill="#FBE1B3", colour="black") +
ggtitle("Impacts of Features on Wellbeing") +
labs(y= "Relative Impacts of Features", x= "Features") +
theme_bw() +
theme(legend.position="none",
axis.text.y = element_text( size=12, face="bold")) +
expand_limits(y=0.85)
print(varImp(model_CART))
## rpart variable importance
##
## only 20 most important variables shown (out of 46)
##
## Overall
## PoorMentalHealthDays 100.0000
## LimitedActivityNot Limited 67.1050
## GeneralHealthPoor 64.3569
## employUnable to work 61.1772
## PoorPhysicalHealthDays 44.3915
## SocialSupportUsually 22.5720
## SocialSupportAlways 18.8065
## maritalNot married 12.5655
## SocialSupport Sometimes 8.0914
## age 3.8893
## CurrentSmokerNot a Current Smoker 2.1379
## employRetired 2.0181
## raceWhite 1.8253
## PrematureMortality 1.2543
## ParkAccess 0.9275
## SomeCollegeRate 0.8861
## PoorSleepDays 0.8672
## employUnemployed for less than 1 yr 0.8315
## employUnemployed for more than 1 yr 0.8118
## GeneralHealthGood 0.7240
plot(model_CART$finalModel, uniform=TRUE,
main="Classification Tree")
text(model_CART$finalModel, all=FALSE, cex=.7)
# Using the Rattle Library
library(rattle)
fancyRpartPlot(model_CART$finalModel,cex = 0.9)
test$pred_cart <- model_CART %>% predict(test)
prob_cart<- model_CART %>% predict(test,type='prob')
confusionMtx=table(test$pred_cart,test$wellbeing)
confusionMtx
##
## Good_Wellbeing Poor_Wellbeing
## Good_Wellbeing 64787 2798
## Poor_Wellbeing 511 805
TP=confusionMtx[1,1]
FP=confusionMtx[1,2]
TN=confusionMtx[2,2]
FN=confusionMtx[2,1]
TP
## [1] 64787
recall=TP/(TP+FN)
recall
## [1] 0.9921743
Accuracy=(TP+TN)/nrow(test)
Accuracy
## [1] 0.9519746
specificity=TN/(TN+FP)
specificity
## [1] 0.2234249
precision=TP/(TP+FP)
precision
## [1] 0.9586003
F1=(2*recall*precision)/(recall+precision)
F1
## [1] 0.9750984
str(prob_cart)
## 'data.frame': 68901 obs. of 2 variables:
## $ Good_Wellbeing: num 0.972 0.972 0.972 0.972 0.972 ...
## $ Poor_Wellbeing: num 0.0276 0.0276 0.0276 0.0276 0.0276 ...
CART.roc=roc(predictor=prob_cart$Good_Wellbeing,response=test$wellbeing,
levels=levels(test$wellbeing))
## Setting direction: controls > cases
CART.roc
##
## Call:
## roc.default(response = test$wellbeing, predictor = prob_cart$Good_Wellbeing, levels = levels(test$wellbeing))
##
## Data: prob_cart$Good_Wellbeing in 65298 controls (test$wellbeing Good_Wellbeing) > 3603 cases (test$wellbeing Poor_Wellbeing).
## Area under the curve: 0.7389
ggroc(CART.roc, alpha = 1, colour = "red", linetype = 1, size =0.7 ) +
geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="black", linetype="dashed") +
ggtitle("ROC Curve for CART Model") +
xlab("1 - specificity") +
ylab("sensitivity") +
annotate(geom="text", x=0.7, y=0.75, label="CART Model classifier", color="red") +
annotate(geom="text", x=0.45, y=0.4, label="random classifier") +
annotate(geom="text", x=0.1, y=0.05, label="AUC = 0.7388", color="darkred") +
theme_bw()